% IEEEtran.bst control entry: turns on forced "et al." author-list
% truncation, with the name-count threshold set to 3.
@IEEEtranBSTCTL{bstctl:etal,
  CTLuse_forced_etal       = {yes},
  CTLmax_names_forced_etal = {3}
}

% IEEEtran.bst control entry: switches off the dash that would otherwise
% stand in for an author list repeated from the previous reference.
@IEEEtranBSTCTL{bstctl:nodash,
  CTLdash_repeated_names = {no}
}

% IEEEtran.bst control entry: sets the text printed in front of URLs.
% The trailing space inside the braces is part of the value.
@IEEEtranBSTCTL{bstctl:simpurl,
  CTLname_url_prefix = {Available: }
}

@book{lamport94,
    author    = {Leslie Lamport},
    title     = {{\LaTeX}: A Document Preparation System},
    edition   = {Second},
    publisher = {Addison-Wesley},
    address   = {Reading, Massachusetts},
    year      = {1994},
}

@book{goossens93,
    author    = {Michel Goossens and Frank Mittelbach and Alexander Samarin},
    title     = {The {\LaTeX} Companion},
    publisher = {Addison-Wesley},
    address   = {Reading, Massachusetts},
    year      = {1993},
}

@misc{micro45,
  author = {Fallin, Chris and Meza, Justin and Seshadri, Vivek and Mutlu, Onur},
  title  = {{MICRO} 2012 Conference Site},
  url    = {http://www.microsymposia.org/micro45},
  year   = {2012},
}

@ARTICLE{1000GP,
  author = {{1000 Genomes Project Consortium}},
  title = {A map of human genome variation from population-scale sequencing},
  journal = {Nature},
  year = {2010},
  volume = {467},
  pages = {1061--1073},
  month = oct,
  abstract = {The 1000 Genomes Project aims to provide a deep characterization of
	human genome sequence variation as a foundation for investigating
	the relationship between genotype and phenotype. Here we present
	results of the pilot phase of the project, designed to develop and
	compare different strategies for genome-wide sequencing with high-throughput
	platforms. We undertook three projects: low-coverage whole-genome
	sequencing of 179 individuals from four populations; high-coverage
	sequencing of two mother-father-child trios; and exon-targeted sequencing
	of 697 individuals from seven populations. We describe the location,
	allele frequency and local haplotype structure of approximately 15
	million single nucleotide polymorphisms, 1 million short insertions
	and deletions, and 20,000 structural variants, most of which were
	previously undescribed. We show that, because we have catalogued
	the vast majority of common variation, over 95\% of the currently
	accessible variants found in any individual are present in this data
	set. On average, each person is found to carry approximately 250
	to 300 loss-of-function variants in annotated genes and 50 to 100
	variants previously implicated in inherited disorders. We demonstrate
	how these results can be used to inform association and functional
	studies. From the two trios, we directly estimate the rate of de
	novo germline base substitution mutations to be approximately 10(-8)
	per base pair per generation. We explore the data with regard to
	signatures of natural selection, and identify a marked reduction
	of genetic variation in the neighbourhood of genes, due to selection
	at linked sites. These methods and public data will support the next
	phase of human genetic research.},
  file = {1000g_pilot:1000GP.pdf:PDF;1000g_pilot_supptext:1000GP-supp.pdf:PDF;1000g_pilot_supptab:1000GP-supp.xls:Excel},
  keywords = {Calibration; Chromosomes, Human, Y, genetics; Computational Biology;
	DNA Mutational Analysis; DNA, Mitochondrial, genetics; Evolution,
	Molecular; Female; Genetic Association Studies; Genetic Variation,
	genetics; Genetics, Population, methods; Genome, Human, genetics;
	Genome-Wide Association Study; Genomics, methods; Genotype; Haplotypes,
	genetics; Humans; Male; Mutation, genetics; Pilot Projects; Polymorphism,
	Single Nucleotide, genetics; Recombination, Genetic, genetics; Sample
	Size; Selection, Genetic, genetics; Sequence Alignment; Sequence
	Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature09534},
  pmid = {20981092},
  timestamp = {2011.06.17},
}

@article{hobbes,
  author  = {Ahmadi, Athena and Behm, Alexander and Honnalli, Nagesh and
             Li, Chen and Weng, Lingjie and Xie, Xiaohui},
  title   = {Hobbes: optimized gram-based methods for efficient read alignment},
  journal = {Nucleic Acids Research},
  volume  = {40},
  pages   = {e41},
  year    = {2011},
}

@ARTICLE{Ajay2011,
  author = {Subramanian S Ajay and Stephen C J Parker and Hatice Ozel Abaan and
	Karin V Fuentes Fajardo and Elliott H Margulies},
  title = {Accurate and comprehensive sequencing of personal genomes},
  journal = {Genome Research},
  year = {2011},
  volume = {21},
  pages = {1498--1505},
  month = sep,
  __markedentry = {[calkan:6]},
  abstract = {As whole-genome sequencing becomes commoditized and we begin to sequence
	and analyze personal genomes for clinical and diagnostic purposes,
	it is necessary to understand what constitutes a complete sequencing
	experiment for determining genotypes and detecting single-nucleotide
	variants. Here, we show that the current recommendation of ∼30× coverage
	is not adequate to produce genotype calls across a large fraction
	of the genome with acceptably low error rates. Our results are based
	on analyses of a clinical sample sequenced on two related Illumina
	platforms, GAII(x) and HiSeq 2000, to a very high depth (126×). We
	used these data to establish genotype-calling filters that dramatically
	increase accuracy. We also empirically determined how the callable
	portion of the genome varies as a function of the amount of sequence
	data used. These results help provide a "sequencing guide" for future
	whole-genome sequencing decisions and metrics by which coverage statistics
	should be reported.},
  file = {Published version:Ajay2011.pdf:PDF},
  institution = {Genome Informatics Section, Genome Technology Branch, National Human
	Genome Research Institute, National Institutes of Health, Bethesda,
	MD 20892, USA.},
  keywords = {Genome, Human; Genomics; Genotype; High-Throughput Nucleotide Sequencing;
	Humans; Polymorphism, Single Nucleotide; Reproducibility of Results;
	Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.123638.111},
  pmid = {21771779},
  timestamp = {2012.05.17},
}

@ARTICLE{Aksay2007,
  author = {Cagri Aksay and Raheleh Salari and Emre Karakoc and Can Alkan and
	S. Cenk Sahinalp},
  title = {{taveRNA}: a web suite for {RNA} algorithms and applications},
  journal = {Nucleic Acids Research},
  year = {2007},
  volume = {35},
  pages = {W325--W329},
  month = jul,
  abstract = {We present taveRNA, a web server package that hosts three RNA web
	services: alteRNA, inteRNA and pRuNA. alteRNA is a new alternative
	for RNA secondary structure prediction. It is based on a dynamic
	programming solution that minimizes the sum of energy density and
	free energy of an RNA structure. inteRNA is the first RNA-RNA interaction
	structure prediction web service. It also employs a dynamic programming
	algorithm to minimize the free energy of the resulting joint structure
	of the two interacting RNAs. Lastly, pRuNA is an efficient database
	pruning service; which given a query RNA, eliminates a significant
	portion of an ncRNA database and returns only a few ncRNAs as potential
	regulators. taveRNA is available at http://compbio.cs.sfu.ca/taverna.},
  file = {taverna:Aksay2007.pdf:PDF},
  keywords = {Algorithms; Base Sequence; Computational Biology; Computer Simulation;
	Humans; Internet; Models, Chemical; Models, Statistical; Molecular
	Sequence Data; Nucleic Acid Conformation; Programming Languages;
	RNA; Sequence Alignment; Sequence Analysis, RNA; Software},
  owner = {calkan},
  pii = {gkm303},
  pmid = {17488837},
  timestamp = {2008.10.05},
}

% NOTE(review): the citation key keeps its historical "2004" suffix so that
% existing \cite commands still resolve. The year was corrected because
% volume 7 of this journal does not match 2004 (other entries in this file
% have vol. 11 in 2004 and vol. 13 in 2006). Number, pages and month were
% added from memory of the published record; please confirm.
@ARTICLE{ZhangZ2004,
  author = {Zheng Zhang and Scott Schwartz and Lukas Wagner and Webb Miller},
  title = {A Greedy Algorithm for Aligning {DNA} Sequences},
  journal = {Journal of Computational Biology},
  year = {2000},
  volume = {7},
  number = {1--2},
  pages = {203--214},
  month = feb # "--" # apr,
}

@ARTICLE{Rasmussen2006,
  author = {Kim R. Rasmussen and Jens Stoye and Eugene W. Myers},
  title = {Efficient {$q$}-Gram Filters for Finding All {$\varepsilon$}-Matches over a Given Length},
  journal = {Journal of Computational Biology},
  year = {2006},
  volume = {13},
  month = apr,
}

@ARTICLE{Albers2011,
  author = {Cornelis A Albers and Gerton Lunter and Daniel G MacArthur and Gilean
	McVean and Willem H Ouwehand and Richard Durbin},
  title = {Dindel: accurate indel calls from short-read data},
  journal = {Genome Research},
  year = {2011},
  volume = {21},
  pages = {961--973},
  month = jun,
  abstract = {Small insertions and deletions (indels) are a common and functionally
	important type of sequence polymorphism. Most of the focus of studies
	of sequence variation is on single nucleotide variants (SNVs) and
	large structural variants. In principle, high-throughput sequencing
	studies should allow identification of indels just as SNVs. However,
	inference of indels from next-generation sequence data is challenging,
	and so far methods for identifying indels lag behind methods for
	calling SNVs in terms of sensitivity and specificity. We propose
	a Bayesian method to call indels from short-read sequence data in
	individuals and populations by realigning reads to candidate haplotypes
	that represent alternative sequence to the reference. The candidate
	haplotypes are formed by combining candidate indels and SNVs identified
	by the read mapper, while allowing for known sequence variants or
	candidates from other methods to be included. In our probabilistic
	realignment model we account for base-calling errors, mapping errors,
	and also, importantly, for increased sequencing error indel rates
	in long homopolymer runs. We show that our method is sensitive and
	achieves low false discovery rates on simulated and real data sets,
	although challenges remain. The algorithm is implemented in the program
	Dindel, which has been used in the 1000 Genomes Project call sets.},
  file = {main:Albers2011.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Hinxton, Cambridgeshire CB10 1HH,
	United Kingdom. caa@sanger.ac.uk},
  keywords = {Algorithms; Bayes Theorem; Haplotypes, genetics; INDEL Mutation, genetics;
	Likelihood Functions; Models, Genetic; Sequence Analysis, DNA, methods;
	Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.112326.110},
  pmid = {20980555},
  timestamp = {2012.03.08},
}

@ARTICLE{Alexandrov1991,
  author = {I. A. Alexandrov and T. D. Mashkova and T. A. Akopian and L. I. Medvedev
	and L. L. Kisselev and S. P. Mitkevich and Y. B. Yurov},
  title = {Chromosome-specific alpha satellites: two distinct families on human
	chromosome 18},
  journal = {Genomics},
  year = {1991},
  volume = {11},
  pages = {15--23},
  month = sep,
  abstract = {Two types of human chromosome 18-specific alpha satellite fragments
	have been cloned and sequenced. They represent closely related but
	distinct alphoid families formed by two different types of the higher-order
	repeated units (1360-bp EcoRI and 1700-bp HindIII fragments) that
	do not alternate in the genome. The individual repeats within each
	family are 99\% identical and interfamily homology is about 78\%.
	Sequence analysis shows that both repeats belong to alphoid suprachromosomal
	family 2, but their homology is not higher than that of family members
	located on different chromosomes. Therefore, the two repeats shared
	a common origin in the recent past, although they are not the direct
	offspring of one ancestral sequence. Our data indicate that these
	two 18-specific domains have appeared as a result of two separate
	amplification events. Despite the high degree of homology, they are
	not undergoing intrachromosomal homogenization, although some variation
	of this process might take place within each domain.},
  file = {main:Alexandrov1991.pdf:PDF},
  institution = {All-Union Research Center of Mental Health, Moscow, USSR.},
  keywords = {Base Sequence; Chromosomes, Human, Pair 18; DNA, Satellite, genetics;
	Humans; Molecular Sequence Data; Nucleic Acid Hybridization; Repetitive
	Sequences, Nucleic Acid; Sequence Homology, Nucleic Acid},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {0888-7543(91)90097-X},
  pmid = {1765373},
  timestamp = {2011.07.19},
}

@ARTICLE{Alkan2002,
  author = {Can Alkan and Jeffrey A Bailey and Evan E Eichler and S. Cenk Sahinalp
	and Eray Tuzun},
  title = {An algorithmic analysis of the role of unequal crossover in alpha-satellite
	{DNA} evolution},
  journal = {Genome Informatics},
  year = {2002},
  volume = {13},
  pages = {93--102},
  abstract = {Human DNA consists of a large number of tandem repeat sequences. Such
	sequences are usually called satellites, with the primary example
	being the centromeric alpha-satellite DNA. The basic repeat unit
	of the alpha-satellite DNA is a 171 bp monomer. However, with the
	exception of peripheral alpha-satellite DNA, monomers can be grouped
	into blocks of k-monomers (4 < k < 20) between which the divergence
	rate is much smaller (e.g. 5\%). Perhaps the simplest and best understood
	mechanism for tandem repeat array evolution is the unequal crossover.
	Although it is possible that the alpha-satellite sequence developed
	as a result of subsequent unequal crossovers only, no formal computational
	framework seems to have been developed to verify this possibility.
	In this paper we develop such a framework and perform experiments
	which seem to indicate that pericentromeric alpha-satellite segments
	(which are devoid of higher-order structure) are evolutionarily distinct
	from the higher-order repeat segments. It is likely that the higher
	order repeats developed independently in distinct regions of the
	genome and were carried into their current locations through an unknown
	mechanism of transposition.},
  file = {main:Alkan2002.pdf:PDF},
  institution = {Department of EECS, Center for Computational Genomics, Case Western
	Reserve University, 10900 Euclid Ave., Cleveland, OH 44106, USA.
	cxa27@eecs.cwru.edu},
  keywords = {Algorithms; Computational Biology, methods; Crossing Over, Genetic;
	DNA, Satellite; Evolution, Molecular; Humans; Phylogeny},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {14571378},
  timestamp = {2011.07.19},
}

@ARTICLE{Alkan2011d,
  author = {Alkan, Can and Cardone, Maria Francesca and Catacchio, Claudia Rita
	and Antonacci, Francesca and O'Brien, Stephen J and Ryder, Oliver A
	and Purgato, Stefania and Zoli, Monica and Della Valle, Giuliano and
	Eichler, Evan E and Ventura, Mario},
  title = {Genome-wide characterization of centromeric satellites from multiple
	mammalian genomes},
  journal = {Genome Research},
  year = {2011},
  volume = {21},
  pages = {137--145},
  month = jan,
  abstract = {Despite its importance in cell biology and evolution, the centromere
	has remained the final frontier in genome assembly and annotation
	due to its complex repeat structure. However, isolation and characterization
	of the centromeric repeats from newly sequenced species are necessary
	for a complete understanding of genome evolution and function. In
	recent years, various genomes have been sequenced, but the characterization
	of the corresponding centromeric DNA has lagged behind. Here, we
	present a computational method (RepeatNet) to systematically identify
	higher-order repeat structures from unassembled whole-genome shotgun
	sequence and test whether these sequence elements correspond to functional
	centromeric sequences. We analyzed genome datasets from six species
	of mammals representing the diversity of the mammalian lineage, namely,
	horse, dog, elephant, armadillo, opossum, and platypus. We define
	candidate monomer satellite repeats and demonstrate centromeric localization
	for five of the six genomes. Our analysis revealed the greatest diversity
	of centromeric sequences in horse and dog in contrast to elephant
	and armadillo, which showed high-centromeric sequence homogeneity.
	We could not isolate centromeric sequences within the platypus genome,
	suggesting that centromeres in platypus are not enriched in satellite
	DNA. Our method can be applied to the characterization of thousands
	of other vertebrate genomes anticipated for sequencing in the near
	future, providing an important tool for annotation of centromeres.},
  file = {main:Alkan2011d.pdf:PDF;supp_figs:Alkan2011d-suppfigs.pdf:PDF;supp_legends:Alkan2011d-supplegends.pdf:PDF;supp_tab1:Alkan2011d-tabS1.xls:Excel;supp_tab2:Alkan2011d-tabS2.pdf:PDF;supp_tab3:Alkan2011d-tabS3.pdf:PDF;supp_tab4:Alkan2011d-tabS4.pdf:PDF},
  institution = {Department of Genome Sciences, Howard Hughes Medical Institute, University
	of Washington School of Medicine, Seattle, Washington 98195, USA.},
  keywords = {Animals; Armadillos, genetics; Base Sequence; Centromere, genetics;
	Computational Biology, methods; Consensus Sequence; DNA, Satellite,
	genetics; Dogs, genetics; Elephants, genetics; Genome, genetics;
	Horses, genetics; Mammals, classification/genetics; Molecular Sequence
	Data; Opossums, genetics; Platypus, genetics; Sequence Analysis,
	DNA; Species Specificity},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.111278.110},
  pmid = {21081712},
  timestamp = {2011.06.17},
}

@ARTICLE{Alkan2011nrgreview,
  author = {Can Alkan and Bradley P Coe and Evan E Eichler},
  title = {Genome structural variation discovery and genotyping},
  journal = {Nature Reviews Genetics},
  year = {2011},
  volume = {12},
  pages = {363--376},
  month = may,
  abstract = {Comparisons of human genomes show that more base pairs are altered
	as a result of structural variation - including copy number variation
	- than as a result of point mutations. Here we review advances and
	challenges in the discovery and genotyping of structural variation.
	The recent application of massively parallel sequencing methods has
	complemented microarray-based methods and has led to an exponential
	increase in the discovery of smaller structural-variation events.
	Some global discovery biases remain, but the integration of experimental
	and computational approaches is proving fruitful for accurate characterization
	of the copy, content and structure of variable regions. We argue
	that the long-term goal should be routine, cost-effective and high
	quality de novo assembly of human genomes to comprehensively assess
	all classes of structural variation.},
  file = {main:Alkan2011.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Foege S413C, 3720 15th Ave NE, Seattle, Washington, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nrg2958},
  pmid = {21358748},
  timestamp = {2011.06.17},
}

@ARTICLE{Alkan2004,
  author = {Can Alkan and Evan E Eichler and Jeffrey A Bailey and S. Cenk Sahinalp
	and Eray T{\"u}z{\"u}n},
  title = {The role of unequal crossover in alpha-satellite {DNA} evolution:
	a computational analysis},
  journal = {Journal of Computational Biology},
  year = {2004},
  volume = {11},
  pages = {933--944},
  abstract = {Human DNA consists of a large number of tandem repeat sequences. Such
	sequences are usually called satellites, with the primary example
	being the centromeric alpha-satellite DNA. The basic repeat unit
	of the alpha-satellite DNA is a 171 bp monomer. Arbitrary monomer
	pairs usually have considerable sequence divergence (20-40\%). However,
	with the exception of peripheral alpha-satellite DNA, monomers can
	be grouped into blocks of k-monomers (4 < or = k < or = 20) between
	which the divergence rate is much smaller (e.g., 5\%). Perhaps the
	simplest and best understood mechanism for tandem repeat array evolution
	is unequal crossover. Although it is possible that alpha-satellite
	sequences developed as a result of subsequent unequal crossovers
	only, no formal computational framework seems to have been developed
	to verify this possibility. In this paper, we develop such a framework
	and report on experiments which imply that pericentromeric alpha-satellite
	segments (which are devoid of higher order structure) are evolutionarily
	distinct from the higher order repeat segments. It is likely that
	the higher order repeats developed independently in distinct regions
	of the genome and were carried into their current locations through
	an unknown mechanism of transposition.},
  file = {main:Alkan2004.pdf:PDF},
  institution = {Department of EECS, Case Western Reserve University, Cleveland, OH
	44106, USA.},
  keywords = {Algorithms; Computational Biology; Crossing Over, Genetic; DNA, Satellite;
	Data Interpretation, Statistical; Evolution, Molecular; Phylogeny},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {15700410},
  timestamp = {2011.07.19},
}

% NOTE(review): the key says "mrFast" but the title is the mrsFAST paper,
% and the author list is abbreviated. Confirm the full author list (in
% particular the first author), volume and pages against the published
% record before relying on this entry.
@ARTICLE{mrFast,
  author = {Can Alkan and others},
  title = {{mrsFAST}: a cache-oblivious algorithm for short-read mapping},
  journal = {Nature Methods},
  year = {2010},
}

@INPROCEEDINGS{Alkan2005,
  author = {C. Alkan and E. Karakoc and J. H. Nadeau and S. C. Sahinalp and K.
	Zhang},
  title = {{RNA-RNA} Interaction Prediction and Antisense {RNA} Target Search},
  booktitle = {Proc. Ninth Annual International Conference on Research in Computational
	Molecular Biology},
  year = {2005},
  pages = {152--171},
  address = {Cambridge, MA, USA},
  month = may # "~14--18,",
  file = {main:Alkan2005.pdf:PDF},
  owner = {calkan},
  timestamp = {2011.07.20},
}

@INPROCEEDINGS{Alkan2006a,
  author = {Can Alkan and Emre Karakoc and S. Cenk Sahinalp and Peter Unrau and
	H. Alexander Ebhardt and Kaizhong Zhang and Jeremy Buhler},
  title = {{RNA} Secondary Structure Prediction via Energy Density Minimization},
  booktitle = {Proc. Tenth Annual International Conference on Research in
	Computational Molecular Biology},
  year = {2006},
  series = {Lecture Notes in Bioinformatics},
  volume = {3909},
  pages = {130--142},
  address = {Venice, Italy},
  month = apr # "~2--5,",
  organization = {RECOMB},
  publisher = {Springer-Verlag},
  file = {main:Alkan2006a.pdf:PDF},
  owner = {calkan},
  timestamp = {2007.05.11},
}

@ARTICLE{Alkan2006,
  author = {Can Alkan and Emre Karako{\c{c}} and Joseph H Nadeau and S. Cenk Sahinalp
	and Kaizhong Zhang},
  title = {{RNA-RNA} interaction prediction and antisense {RNA} target search},
  journal = {Journal of Computational Biology},
  year = {2006},
  volume = {13},
  pages = {267--282},
  month = mar,
  abstract = {Recent studies demonstrating the existence of special noncoding "antisense"
	RNAs used in post transcriptional gene regulation have received considerable
	attention. These RNAs are synthesized naturally to control gene expression
	in C. elegans, Drosophila, and other organisms; they are known to
	regulate plasmid copy numbers in E. coli as well. Small RNAs have
	also been artificially constructed to knock out genes of interest
	in humans and other organisms for the purpose of finding out more
	about their functions. Although there are a number of algorithms
	for predicting the secondary structure of a single RNA molecule,
	no such algorithm exists for reliably predicting the joint secondary
	structure of two interacting RNA molecules or measuring the stability
	of such a joint structure. In this paper, we describe the RNA-RNA
	interaction prediction (RIP) problem between an antisense RNA and
	its target mRNA and develop efficient algorithms to solve it. Our
	algorithms minimize the joint free energy between the two RNA molecules
	under a number of energy models with growing complexity. Because
	the computational resources needed by our most accurate approach
	is prohibitive for long RNA molecules, we also describe how to speed
	up our techniques through a number of heuristic approaches while
	experimentally maintaining the original accuracy. Equipped with this
	fast approach, we apply our method to discover targets for any given
	antisense RNA in the associated genome sequence.},
  file = {main:Alkan2006.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington, Seattle,
	98195, USA.},
  keywords = {Adenosine Triphosphatases, chemistry/genetics; Algorithms; Base Sequence;
	Binding Sites; Cation Transport Proteins, chemistry/genetics; Computational
	Biology; Escherichia coli Proteins, chemistry/genetics; Escherichia
	coli, genetics; Genome, Bacterial; Molecular Sequence Data; Nucleic
	Acid Conformation; Plasmids; RNA Stability; RNA, Antisense, genetics;
	RNA, Bacterial, chemistry/genetics/metabolism; Trans-Activators,
	chemistry/genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {16597239},
  timestamp = {2011.07.19},
}

@ARTICLE{Alkan2009,
  author = {Can Alkan and Jeffrey M Kidd and Tomas Marques-Bonet and Gozde Aksay
	and Francesca Antonacci and Fereydoun Hormozdiari and Jacob O Kitzman
	and Carl Baker and Maika Malig and Onur Mutlu and S. Cenk Sahinalp
	and Richard A Gibbs and Evan E Eichler},
  title = {Personalized copy number and segmental duplication maps using next-generation
	sequencing},
  journal = {Nature Genetics},
  year = {2009},
  volume = {41},
  pages = {1061--1067},
  month = oct,
  abstract = {Despite their importance in gene innovation and phenotypic variation,
	duplicated regions have remained largely intractable owing to difficulties
	in accurately resolving their structure, copy number and sequence
	content. We present an algorithm (mrFAST) to comprehensively map
	next-generation sequence reads, which allows for the prediction of
	absolute copy-number variation of duplicated segments and genes.
	We examine three human genomes and experimentally validate genome-wide
	copy number differences. We estimate that, on average, 73-87 genes
	vary in copy number between any two individuals and find that these
	genic differences overwhelmingly correspond to segmental duplications
	(odds ratio = 135; P < 2.2 x 10(-16)). Our method can distinguish
	between different copies of highly identical genes, providing a more
	accurate assessment of gene content and insight into functional constraint
	without the limitations of array-based technology.},
  file = {main:Alkan2009.pdf:PDF;supp_text:Alkan2009-supp.pdf:PDF;supp_tab4:Alkan2009-suptab4.xls:Excel;supp_tab5:Alkan2009-suptab5.xls:Excel},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, Washington, USA.},
  keywords = {Algorithms; Chromosome Mapping; DNA; Gene Dosage; Gene Duplication;
	Genome, Human; Genomic Library; Humans; Polymorphism, Genetic; Sequence
	Analysis, DNA},
  owner = {calkan},
  pii = {ng.437},
  pmid = {19718026},
  timestamp = {2009.10.25},
}

@ARTICLE{Alkan2011c,
  author = {Can Alkan and Saba Sajjadian and Evan E Eichler},
  title = {Limitations of next-generation genome sequence assembly},
  journal = {Nature Methods},
  year = {2011},
  volume = {8},
  pages = {61--65},
  month = jan,
  abstract = {High-throughput sequencing technologies promise to transform the fields
	of genetics and comparative biology by delivering tens of thousands
	of genomes in the near future. Although it is feasible to construct
	de novo genome assemblies in a few months, there has been relatively
	little attention to what is lost by sole application of short sequence
	reads. We compared the recent de novo assemblies using the short
	oligonucleotide analysis package (SOAP), generated from the genomes
	of a Han Chinese individual and a Yoruban individual, to experimentally
	validated genomic features. We found that de novo assemblies were
	16.2\% shorter than the reference genome and that 420.2 megabase
	pairs of common repeats and 99.1\% of validated duplicated sequences
	were missing from the genome. Consequently, over 2,377 coding exons
	were completely missing. We conclude that high-quality sequencing
	approaches must be considered in conjunction with high-throughput
	sequencing for comparative genomics analyses and studies of genome
	evolution.},
  file = {main:Alkan2011c.pdf:PDF;supp_text:Alkan2011c-supp.pdf:PDF;supp_tab1:Alkan2011c-suptab1.xls:Excel;supp_tab3:Alkan2011c-suptab3.xls:Excel;supp_tab4:Alkan2011c-suptab4.xlsx:Excel;supp_tab5:Alkan2011c-suptab5.xls:Excel},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine and Howard Hughes Medical Institute, Seattle, Washington,
	USA.},
  keywords = {Algorithms; Base Sequence; Evolution, Molecular; Genome, Human, genetics;
	Genomics, economics/methods/trends; Humans; Sequence Analysis, DNA,
	economics/methods/standards/trends},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nmeth.1527},
  pmid = {21102452},
  timestamp = {2011.06.17},
}

@ARTICLE{Alkan2005b,
  author = {Can Alkan and Eray T{\"u}z{\"u}n and Jerome Buard and Franck Lethiec and
	Evan E Eichler and Jeffrey A Bailey and S. Cenk Sahinalp},
  title = {Manipulating multiple sequence alignments via {MaM} and {WebMaM}},
  journal = {Nucleic Acids Research},
  year = {2005},
  volume = {33},
  pages = {W295--W298},
  month = jul,
  abstract = {MaM is a software tool that processes and manipulates multiple alignments
	of genomic sequence. MaM computes the exact location of common repeat
	elements, exons and unique regions within aligned genomics sequences
	using a variety of user identified programs, databases and/or tables.
	The program can extract subalignments, corresponding to these various
	regions of DNA to be analyzed independently or in conjunction with
	other elements of genomic DNA. Graphical displays further allow an
	assessment of sequence variation throughout these different regions
	of the aligned sequence, providing separate displays for their repeat,
	non-repeat and coding portions of genomic DNA. The program should
	facilitate the phylogenetic analysis and processing of different
	portions of genomic sequence as part of large-scale sequencing efforts.
	MaM source code is freely available for non-commercial use at http://compbio.cs.sfu.ca/MAM.htm;
	and the web interface WebMaM is hosted at http://atgc.lirmm.fr/mam.},
  file = {main:Alkan2005b.pdf:PDF},
  institution = {Department of EECS, Case Western Reserve University, Cleveland, OH,
	USA.},
  keywords = {Exons; Genomics; Internet; Phylogeny; Repetitive Sequences, Nucleic
	Acid; Sequence Alignment; Software},
  owner = {calkan},
  pii = {33/suppl_2/W295},
  pmid = {15980474},
  timestamp = {2009.09.18},
}

@ARTICLE{Alkan2007,
  author = {Can Alkan and Mario Ventura and Nicoletta Archidiacono and Mariano
	Rocchi and S. Cenk Sahinalp and Evan E Eichler},
  title = {Organization and evolution of primate centromeric {DNA} from whole-genome
	shotgun sequence data},
  journal = {PLoS Computational Biology},
  year = {2007},
  volume = {3},
  pages = {1807--1818},
  month = sep,
  abstract = {The major DNA constituent of primate centromeres is alpha satellite
	DNA. As much as 2\%-5\% of sequence generated as part of primate
	genome sequencing projects consists of this material, which is fragmented
	or not assembled as part of published genome sequences due to its
	highly repetitive nature. Here, we develop computational methods
	to rapidly recover and categorize alpha-satellite sequences from
	previously uncharacterized whole-genome shotgun sequence data. We
	present an algorithm to computationally predict potential higher-order
	array structure based on paired-end sequence data and then experimentally
	validate its organization and distribution by experimental analyses.
	Using whole-genome shotgun data from the human, chimpanzee, and macaque
	genomes, we examine the phylogenetic relationship of these sequences
	and provide further support for a model for their evolution and mutation
	over the last 25 million years. Our results confirm fundamental differences
	in the dispersal and evolution of centromeric satellites in the Old
	World monkey and ape lineages of evolution.},
  file = {main:Alkan2007.pdf:PDF;figS1:Alkan2007-figS1.ps:PostScript;figS2:Alkan2007-figS2.ps:PostScript;figS3:Alkan2007-figS3.jpg:JPG image;figS4:Alkan2007-figS4.ps:PostScript;figS5:Alkan2007-figS5.ps:PostScript;tabS1:Alkan2007-tabS1.xls:Excel;tabS2:Alkan2007-tabS2.xls:Excel;tabS3:Alkan2007-tabS3.xls:Excel;tabS4:Alkan2007-tabS4.xls:Excel;tabS5:Alkan2007-tabS5.xls:Excel},
  keywords = {Animals; Base Sequence; Centromere; Chromosome Mapping; DNA, Satellite;
	Evolution; Evolution, Molecular; Genome; Humans; Macaca; Molecular
	Sequence Data; Pan troglodytes; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {07-PLCB-RA-0242},
  pmid = {17907796},
  timestamp = {2008.02.06},
}

% Alon et al. (2008), Bioinformatics. The title carries no trailing period
% (the bibliography style supplies punctuation); month uses the BibTeX macro.
@ARTICLE{Alon2008,
  author = {Noga Alon and Phuong Dao and Iman Hajirasouliha and Fereydoun Hormozdiari
	and S. Cenk Sahinalp},
  title = {Biomolecular network motif counting and discovery by color coding},
  journal = {Bioinformatics},
  year = {2008},
  volume = {24},
  pages = {i241--i249},
  month = jul,
  abstract = {Protein-protein interaction (PPI) networks of many organisms share
	global topological features such as degree distribution, k-hop reachability,
	betweenness and closeness. Yet, some of these networks can differ
	significantly from the others in terms of local structures: e.g.
	the number of specific network motifs can vary significantly among
	PPI networks. Counting the number of network motifs provides a major
	challenge to compare biomolecular networks. Recently developed algorithms
	have been able to count the number of induced occurrences of subgraphs
	with k < or = 7 vertices. Yet no practical algorithm exists for counting
	non-induced occurrences, or counting subgraphs with k > or = 8 vertices.
	Counting non-induced occurrences of network motifs is not only challenging
	but also quite desirable as available PPI networks include several
	false interactions and miss many others. In this article, we show
	how to apply the 'color coding' technique for counting non-induced
	occurrences of subgraph topologies in the form of trees and bounded
	treewidth subgraphs. Our algorithm can count all occurrences of motif
	G' with k vertices in a network G with n vertices in time polynomial
	with n, provided k = O(log n). We use our algorithm to obtain 'treelet'
	distributions for k < or = 10 of available PPI networks of unicellular
	organisms (Saccharomyces cerevisiae Escherichia coli and Helicobacter
	Pyloris), which are all quite similar, and a multicellular organism
	(Caenorhabditis elegans) which is significantly different. Furthermore,
	the treelet distribution of the unicellular organisms are similar
	to that obtained by the 'duplication model' but are quite different
	from that of the 'preferential attachment model'. The treelet distribution
	is robust w.r.t. sparsification with bait/edge coverage of 70\% but
	differences can be observed when bait/edge coverage drops to 50\%.},
  file = {main:Alon2008.pdf:PDF},
  institution = {School of Mathematical Sciences, Tel Aviv University, Ramat Aviv,
	Israel.},
  keywords = {Algorithms; Color; Computer Simulation; Models, Biological; Protein
	Interaction Mapping, methods; Proteome, metabolism; Signal Transduction,
	physiology},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btn163},
  pmid = {18586721},
  timestamp = {2010.09.15},
}

% Altschul et al. (1990), "Basic local alignment search tool".
% Author names are stored in the unambiguous "Last, First" form.
@article{blast,
  author  = {Altschul, Stephen F. and Gish, Warren and Miller, Webb and
             Myers, Eugene W. and Lipman, David J.},
  title   = {Basic local alignment search tool},
  journal = {Journal of Molecular Biology},
  volume  = {215},
  pages   = {403--410},
  year    = {1990},
}

% Antonacci et al. (2010), Nat Genet. The author list is truncated with the
% BibTeX "and others" token; the title carries no trailing period and the
% month uses the BibTeX macro.
@ARTICLE{Antonacci2010,
  author = {Francesca Antonacci and Jeffrey M Kidd and Tomas Marques-Bonet and
	others},
  title = {A large and complex structural polymorphism at 16p12.1 underlies
	microdeletion disease risk},
  journal = {Nat Genet},
  year = {2010},
  volume = {42},
  pages = {745--750},
  month = sep,
  abstract = {There is a complex relationship between the evolution of segmental
	duplications and rearrangements associated with human disease. We
	performed a detailed analysis of one region on chromosome 16p12.1
	associated with neurocognitive disease and identified one of the
	largest structural inconsistencies in the human reference assembly.
	Various genomic analyses show that all examined humans are homozygously
	inverted relative to the reference genome for a 1.1-Mb region on
	16p12.1. We determined that this assembly discrepancy stems from
	two common structural configurations with worldwide frequencies of
	17.6\% (S1) and 82.4\% (S2). This polymorphism arose from the rapid
	integration of segmental duplications, precipitating two local inversions
	within the human lineage over the last 10 million years. The two
	human haplotypes differ by 333 kb of additional duplicated sequence
	present in S2 but not in S1. Notably, we show that the S2 configuration
	harbors directly oriented duplications, specifically predisposing
	this chromosome to disease-associated rearrangement.},
  file = {main:Antonacci2010.pdf:PDF;supp_note:Antonacci2010-supp.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington, Seattle,
	Washington, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {ng.643},
  pmid = {20729854},
  timestamp = {2010.09.15},
}

% Antonacci et al. (2009), Hum Mol Genet. The author list is truncated with
% the BibTeX "and others" token; the title carries no trailing period and the
% month uses the BibTeX macro.
@ARTICLE{Antonacci2009,
  author = {Francesca Antonacci and Jeffrey M Kidd and Tomas Marques-Bonet and
	others},
  title = {Characterization of six human disease-associated inversion polymorphisms},
  journal = {Hum Mol Genet},
  year = {2009},
  volume = {18},
  pages = {2555--2566},
  month = jul,
  abstract = {The human genome is a highly dynamic structure that shows a wide range
	of genetic polymorphic variation. Unlike other types of structural
	variation, little is known about inversion variants within normal
	individuals because such events are typically balanced and are difficult
	to detect and analyze by standard molecular approaches. Using sequence-based,
	cytogenetic and genotyping approaches, we characterized six large
	inversion polymorphisms that map to regions associated with genomic
	disorders with complex segmental duplications mapping at the breakpoints.
	We developed a metaphase FISH-based assay to genotype inversions
	and analyzed the chromosomes of 27 individuals from three HapMap
	populations. In this subset, we find that these inversions are less
	frequent or absent in Asians when compared with European and Yoruban
	populations. Analyzing multiple individuals from outgroup species
	of great apes, we show that most of these large inversion polymorphisms
	are specific to the human lineage with two exceptions, 17q21.31 and
	8p23 inversions, which are found to be similarly polymorphic in other
	great ape species and where the inverted allele represents the ancestral
	state. Investigating linkage disequilibrium relationships with genotyped
	SNPs, we provide evidence that most of these inversions appear to
	have arisen on at least two different haplotype backgrounds. In these
	cases, discovery and genotyping methods based on SNPs may be confounded
	and molecular cytogenetics remains the only method to genotype these
	inversions.},
  file = {main:Antonacci2009.pdf:PDF;supp_note:Antonacci2009-supp.pdf:PDF;supp_tables:Antonacci2009-supptables.xls:Excel},
  institution = {Department of Genome Sciences, Howard Hughes Medical Institute, University
	of Washington, Seattle, WA 98195, USA.},
  keywords = {Animals; Chromosome Inversion; Chromosome Mapping; Chromosomes, Human,
	Pair 17, genetics; Chromosomes, Human, Pair 8, genetics; Continental
	Population Groups, genetics; Disease, genetics; Evolution, Molecular;
	Haplotypes; Humans; Linkage Disequilibrium; Polymorphism, Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {ddp187},
  pmid = {19383631},
  timestamp = {2010.09.15},
}

% Vendor web page for the SOLiD sequencing system. The corporate author is
% double-braced so BibTeX treats it as a single name. The URL previously had a
% duplicated "http://" scheme, which made the link invalid.
% NOTE(review): the trailing "/" after "solid.html" is kept as found; confirm
% the page still resolves.
@MISC{solid,
  author = {{Applied Biosystems}},
  title = {{The Applied Biosystems SOLiD System}},
  howpublished = {\url{http://marketing.appliedbiosystems.com/images/Product/Solid_Knowledge/flash/102207/solid.html/}}
}

% Bailey et al. (2004), Genome Biol, article number R23. The title carries no
% trailing period; two fused sentence boundaries in the abstract are repaired.
@ARTICLE{Bailey2004a,
  author = {Jeffrey A Bailey and Robert Baertsch and W. James Kent and David
	Haussler and Evan E Eichler},
  title = {Hotspots of mammalian chromosomal evolution},
  journal = {Genome Biol},
  year = {2004},
  volume = {5},
  pages = {R23},
  abstract = {Chromosomal evolution is thought to occur through a random process
	of breakage and rearrangement that leads to karyotype differences
	and disruption of gene order. With the availability of both the human
	and mouse genomic sequences, detailed analysis of the sequence properties
	underlying these breakpoints is now possible. We report an abundance
	of primate-specific segmental duplications at the breakpoints of
	syntenic blocks in the human genome. Using conservative criteria,
	we find that 25\% (122/461) of all breakpoints contain > or = 10
	kb of duplicated sequence. This association is highly significant
	(p < 0.0001) when compared to a simulated random-breakage model.
	The significance is robust under a variety of parameters, multiple
	sets of conserved synteny data, and for orthologous breakpoints between
	and within chromosomes. A comparison of mouse lineage-specific breakpoints
	since the divergence of rat and mouse showed a similar association
	with regions associated with segmental duplications in the primate
	genome. These results indicate that segmental duplications are associated
	with syntenic rearrangements, even when pericentromeric and subtelomeric
	regions are excluded. However, segmental duplications are not necessarily
	the cause of the rearrangements. Rather, our analysis supports a
	nonrandom model of chromosomal evolution that implicates specific
	regions within the mammalian genome as having been predisposed to
	both recurrent small-scale duplication and large-scale evolutionary
	rearrangements.},
  institution = {Department of Genetics, Center for Computational Genomics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, Cleveland, OH 44106, USA.},
  keywords = {Animals; Chromosome Breakage, genetics; Chromosome Mapping, methods;
	Chromosomes, Human, genetics; Chromosomes, genetics; Evolution, Molecular;
	Gene Duplication; Genome; Genome, Human; Gorilla gorilla, genetics;
	Humans; Mice; Pan troglodytes, genetics; Rats; Synteny, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gb-2004-5-4-r23},
  pmid = {15059256},
  timestamp = {2011.07.19},
}

% Bailey et al. (2004), Genome Res. The title carries no trailing period and
% the month uses the BibTeX macro. The institution ZIP code is restored to
% 44106, matching the same Cleveland affiliation in sibling entries.
@ARTICLE{Bailey2004,
  author = {Jeffrey A Bailey and Deanna M Church and Mario Ventura and Mariano
	Rocchi and Evan E Eichler},
  title = {Analysis of segmental duplications and genome assembly in the mouse},
  journal = {Genome Res},
  year = {2004},
  volume = {14},
  pages = {789--801},
  month = may,
  abstract = {Limited comparative studies suggest that the human genome is particularly
	enriched for recent segmental duplications. The extent of segmental
	duplications in other mammalian genomes is unknown and confounded
	by methodological differences in genome assembly. Here, we present
	a detailed analysis of recent duplication content within the mouse
	genome using a whole-genome assembly comparison method and a novel
	assembly independent method, designed to take advantage of the reduced
	allelic variation of the C57BL/6J strain. We conservatively estimate
	that approximately 57\% of all highly identical segmental duplications
	(>or=90\%) were misassembled or collapsed within the working draft
	WGS assembly. The WGS approach often leaves duplications fragmented
	and unassigned to a chromosome when compared with the clone-ordered-based
	approach. Our preliminary analysis suggests that 1.7\%-2.0\% of the
	mouse genome is part of recent large segmental duplications (about
	half of what is observed for the human genome). We have constructed
	a mouse segmental duplication database to aid in the characterization
	of these regions and their integration into the final mouse genome
	assembly. This work suggests significant biological differences in
	the architecture of recent segmental duplications between human and
	mouse. In addition, our unique method provides the means for improving
	whole-genome shotgun sequence assembly of mouse and future mammalian
	genomes.},
  institution = {Department of Genetics, Center for Computational Genomics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, Cleveland, Ohio 44106, USA.},
  keywords = {Animals; Base Composition, genetics; Bone Marrow Cells, chemistry/metabolism;
	Cell Nucleus, genetics; Chromosome Mapping, methods/standards/statistics
	/&/ numerical data; Computational Biology, methods; DNA, genetics;
	Databases, Genetic; Gene Duplication; Genes; Genome; Genome, Human;
	Humans; In Situ Hybridization, Fluorescence, methods; Mice; Mice,
	Inbred C57BL; Sequence Alignment, methods; Sequence Analysis, DNA,
	methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {14/5/789},
  pmid = {15123579},
  timestamp = {2011.07.19},
}

% Bailey and Eichler (2006), Nat Rev Genet. The title carries no trailing
% period (the bibliography style supplies punctuation); month uses the BibTeX
% macro.
@ARTICLE{Bailey2006,
  author = {Jeffrey A Bailey and Evan E Eichler},
  title = {Primate segmental duplications: crucibles of evolution, diversity
	and disease},
  journal = {Nat Rev Genet},
  year = {2006},
  volume = {7},
  pages = {552--564},
  month = jul,
  abstract = {Compared with other mammals, the genomes of humans and other primates
	show an enrichment of large, interspersed segmental duplications
	(SDs) with high levels of sequence identity. Recent evidence has
	begun to shed light on the origin of primate SDs, pointing to a complex
	interplay of mechanisms and indicating that distinct waves of duplication
	took place during primate evolution. There is also evidence for a
	strong association between duplication, genomic instability and large-scale
	chromosomal rearrangements. Exciting new findings suggest that SDs
	have not only created novel primate gene families, but might have
	also influenced current human genic and phenotypic variation on a
	previously unappreciated scale. A growing number of examples link
	natural human genetic variation of these regions to susceptibility
	to common disease.},
  institution = {Department of Pathology, Case Western University School of Medicine
	and University Hospitals of Cleveland, Ohio 44106, USA.},
  keywords = {Animals; Evolution, Molecular; Gene Duplication; Genetic Diseases,
	Inborn, genetics; Genetic Variation; Humans; Primates, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nrg1895},
  pmid = {16770338},
  timestamp = {2011.07.19},
}

% Bailey et al. (2002), Science. The title carries no trailing period (the
% bibliography style supplies punctuation); month uses the BibTeX macro.
@ARTICLE{Bailey2002,
  author = {Jeffrey A Bailey and Zhiping Gu and Royden A Clark and Knut Reinert
    and Rhea V Samonte and Stuart Schwartz and Mark D Adams and Eugene
    W Myers and Peter W Li and Evan E Eichler},
  title = {Recent segmental duplications in the human genome},
  journal = {Science},
  year = {2002},
  volume = {297},
  pages = {1003--1007},
  month = aug,
  abstract = {Primate-specific segmental duplications are considered important in
	human disease and evolution. The inability to distinguish between
	allelic and duplication sequence overlap has hampered their characterization
	as well as assembly and annotation of our genome. We developed a
	method whereby each public sequence is analyzed at the clone level
	for overrepresentation within a whole-genome shotgun sequence. This
	test has the ability to detect duplications larger than 15 kilobases
	irrespective of copy number, location, or high sequence similarity.
	We mapped 169 large regions flanked by highly similar duplications.
	Twenty-four of these hot spots of genomic instability have been associated
	with genetic disease. Our analysis indicates a highly nonrandom chromosomal
	and genic distribution of recent segmental duplications, with a likely
	role in expanding protein diversity.},
  file = {main:Bailey2002.pdf:PDF;supp_text:Bailey2002-supp.pdf:PDF;supp_table:Bailey2002-supptable.pdf:PDF},
  institution = {Department of Genetics, Center for Computational Genomics, and Center
	for Human Genetics, Case Western Reserve University School of Medicine
	and University Hospitals of Cleveland, Cleveland, OH 44106, USA.},
  keywords = {Alleles; Base Sequence; Biological Evolution; Chromosomes, Human,
	genetics; Computational Biology; Databases, Nucleic Acid; Exons;
	Expressed Sequence Tags; Gene Duplication; Gene Rearrangement; Genes,
	Duplicate; Genetic Diseases, Inborn, genetics; Genome, Human; Humans;
	Models, Genetic; Polymorphism, Single Nucleotide; Proteome; Recombination,
	Genetic; Sequence Alignment},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {297/5583/1003},
  pmid = {12169732},
  timestamp = {2011.07.19},
}

% Bailey, Kidd and Eichler (2008), Cytogenet Genome Res. The title carries no
% trailing period (the bibliography style supplies punctuation).
@ARTICLE{Bailey2008,
  author = {J. A. Bailey and J. M. Kidd and E. E. Eichler},
  title = {Human copy number polymorphic genes},
  journal = {Cytogenet Genome Res},
  year = {2008},
  volume = {123},
  pages = {234--243},
  abstract = {Recent large-scale genomic studies within human populations have identified
	numerous genomic regions as copy number variant (CNV). As these CNV
	regions often overlap coding regions of the genome, large lists of
	potentially copy number polymorphic genes have been produced that
	are candidates for disease association. Most of the current data
	regarding normal genic variation, however, has been generated using
	BAC or SNP microarrays, which lack precision especially with respect
	to exons. To address this, we assessed 2,790 candidate CNV genes
	defined from available studies in nine well-characterized HapMap
	individuals by designing a customized oligonucleotide microarray
	targeted specifically to exons. Using exon array comparative genomic
	hybridization (aCGH), we detected 255 (9\%) of the candidates as
	true CNVs including 134 with evidence of variation over the entire
	gene. Individuals differed in copy number from the control by an
	average of 100 gene loci. Both partial- and whole-gene CNVs were
	strongly associated with segmental duplications (55 and 71\%, respectively)
	as well as regions of positive selection. We confirmed 37\% of the
	whole-gene CNVs using the fosmid end sequence pair (ESP) structural
	variation map for these same individuals. If we modify the end sequence
	pair mapping strategy to include low-sequence identity ESPs (98-99.5\%)
	and ESPs with an everted orientation, we can capture 82\% of the
	missed genes leading to more complete ascertainment of structural
	variation within duplicated genes. Our results indicate that segmental
	duplications are the source of the majority of full-length copy number
	polymorphic genes, most of the variant genes are organized as tandem
	duplications, and a significant fraction of these genes will represent
	paralogs with levels of sequence diversity beyond thresholds of allelic
	variation. In addition, these data provide a targeted set of CNV
	genes enriched for regions likely to be associated with human phenotypic
	differences due to copy number changes and present a source of copy
	number responsive oligonucleotide probes for future association studies.},
  institution = {Department of Pathology, Case Western University School of Medicine
	and University Hospitals of Cleveland, Cleveland, OH, USA. jab@case.edu},
  keywords = {Algorithms; Comparative Genomic Hybridization; Exons, genetics; False
	Negative Reactions; False Positive Reactions; Gene Dosage, genetics;
	Humans; Polymorphism, Genetic, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {000184713},
  pmid = {19287160},
  timestamp = {2011.07.19},
}

% Bailey, Liu and Eichler (2003), Am J Hum Genet. "Alu" is brace-protected so
% sentence-casing styles keep its capital; the title carries no trailing
% period and the month uses the BibTeX macro.
@ARTICLE{Bailey2003a,
  author = {Jeffrey A Bailey and Ge Liu and Evan E Eichler},
  title = {An {Alu} transposition model for the origin and expansion of human
	segmental duplications},
  journal = {Am J Hum Genet},
  year = {2003},
  volume = {73},
  pages = {823--834},
  month = oct,
  abstract = {Relative to genomes of other sequenced organisms, the human genome
	appears particularly enriched for large, highly homologous segmental
	duplications (> or =90\% sequence identity and > or =10 kbp in length).
	The molecular basis for this enrichment is unknown. We sought to
	gain insight into the mechanism of origin, by systematically examining
	sequence features at the junctions of duplications. We analyzed 9,464
	junctions within regions of high-quality finished sequence from a
	genomewide set of 2,366 duplication alignments. We observed a highly
	significant (P<.0001) enrichment of Alu short interspersed element
	(SINE) sequences near or within the junction. Twenty-seven percent
	of all segmental duplications terminated within an Alu repeat. The
	Alu junction enrichment was most pronounced for interspersed segmental
	duplications separated by > or =1 Mb of intervening sequence. Alu
	elements at the junctions showed higher levels of divergence, consistent
	with Alu-Alu-mediated recombination events. When we classified Alu
	elements into major subfamilies, younger elements (AluY and AluS)
	accounted for the enrichment, whereas the oldest primate family (AluJ)
	showed no enrichment. We propose that the primate-specific burst
	of Alu retroposition activity (which occurred 35-40 million years
	ago) sensitized the ancestral human genome for Alu-Alu-mediated recombination
	events, which, in turn, initiated the expansion of gene-rich segmental
	duplications and their subsequent role in nonallelic homologous recombination.},
  institution = {Department of Genetics, Center for Computational Genomics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, Cleveland, OH, 44106, USA.},
  keywords = {Alu Elements, genetics; Chromosome Mapping; Gene Duplication; Genetic
	Variation; Genome, Human; Humans; Models, Genetic; Sequence Alignment;
	Sequence Homology, Nucleic Acid},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0002-9297(07)63631-3},
  pmid = {14505274},
  timestamp = {2011.07.19},
}

% Bailey et al. (2001), Genome Res. The title carries no trailing period (the
% bibliography style supplies punctuation); month uses the BibTeX macro.
@ARTICLE{Bailey2001,
  author = {J. A. Bailey and A. M. Yavor and H. F. Massa and B. J. Trask and
	E. E. Eichler},
  title = {Segmental duplications: organization and impact within the current
	human genome project assembly},
  journal = {Genome Res},
  year = {2001},
  volume = {11},
  pages = {1005--1017},
  month = jun,
  abstract = {Segmental duplications play fundamental roles in both genomic disease
	and gene evolution. To understand their organization within the human
	genome, we have developed the computational tools and methods necessary
	to detect identity between long stretches of genomic sequence despite
	the presence of high copy repeats and large insertion-deletions.
	Here we present our analysis of the most recent genome assembly (January
	2001) in which we focus on the global organization of these segments
	and the role they play in the whole-genome assembly process. Initially,
	we considered only large recent duplication events that fell well-below
	levels of draft sequencing error (alignments 90\%-98\% similar and
	> or =1 kb in length). Duplications (90\%-98\%; > or =1 kb) comprise
	3.6\% of all human sequence. These duplications show clustering and
	up to 10-fold enrichment within pericentromeric and subtelomeric
	regions. In terms of assembly, duplicated sequences were found to
	be over-represented in unordered and unassigned contigs indicating
	that duplicated sequences are difficult to assign to their proper
	position. To assess coverage of these regions within the genome,
	we selected BACs containing interchromosomal duplications and characterized
	their duplication pattern by FISH. Only 47\% (106/224) of chromosomes
	positive by FISH had a corresponding chromosomal position by comparison.
	We present data that indicate that this is attributable to misassembly,
	misassignment, and/or decreased sequencing coverage within duplicated
	regions. Surprisingly, if we consider putative duplications >98\%
	identity, we identify 10.6\% (286 Mb) of the current assembly as
	paralogous. The majority of these alignments, we believe, represent
	unmerged overlaps within unique regions. Taken together the above
	data indicate that segmental duplications represent a significant
	impediment to accurate human genome assembly, requiring the development
	of specialized techniques to finish these exceptional regions of
	the genome. The identification and characterization of these highly
	duplicated regions represents an important step in the complete sequencing
	of a human reference genome.},
  file = {main:Bailey2001.pdf:PDF;tabS1:Bailey2001-tabS1.xls:Excel;tabS2:Bailey2001-tabS2.xls:Excel},
  institution = {Department of Genetics and Center for Human Genetics, Case Western
	Reserve School of Medicine and University Hospitals of Cleveland,
	Cleveland, Ohio 44106, USA.},
  keywords = {Base Sequence; Centromere, genetics; Computational Biology, trends;
	Contig Mapping, trends; Databases, Factual; Gene Duplication; Human
	Genome Project; Humans; Molecular Sequence Data; Telomere, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {11381028},
  timestamp = {2011.07.19},
}

% Bailey et al. (2002), Am J Hum Genet. The title carries no trailing period
% (the bibliography style supplies punctuation); month uses the BibTeX macro.
@ARTICLE{Bailey2002a,
  author = {Jeffrey A Bailey and Amy M Yavor and Luigi Viggiano and Doriana Misceo
    and Juliann E Horvath and Nicoletta Archidiacono and Stuart Schwartz
    and Mariano Rocchi and Evan E Eichler},
  title = {Human-specific duplication and mosaic transcripts: the recent paralogous
	structure of chromosome 22},
  journal = {Am J Hum Genet},
  year = {2002},
  volume = {70},
  pages = {83--100},
  month = jan,
  abstract = {In recent decades, comparative chromosomal banding, chromosome painting,
	and gene-order studies have shown strong conservation of gross chromosome
	structure and gene order in mammals. However, findings from the human
	genome sequence suggest an unprecedented degree of recent (<35 million
	years ago) segmental duplication. This dynamism of segmental duplications
	has important implications in disease and evolution. Here we present
	a chromosome-wide view of the structure and evolution of the most
	highly homologous duplications (> or = 1 kb and > or = 90\%) on chromosome
	22. Overall, 10.8\% (3.7/33.8 Mb) of chromosome 22 is duplicated,
	with an average sequence identity of 95.4\%. To organize the duplications
	into tractable units, intron-exon structure and well-defined duplication
	boundaries were used to define 78 duplicated modules (minimally shared
	evolutionary segments) with 157 copies on chromosome 22. Analysis
	of these modules provides evidence for the creation or modification
	of 11 novel transcripts. Comparative FISH analyses of human, chimpanzee,
	gorilla, orangutan, and macaque reveal qualitative and quantitative
	differences in the distribution of these duplications--consistent
	with their recent origin. Several duplications appear to be human
	specific, including a approximately 400-kb duplication (99.4\%-99.8\%
	sequence identity) that transposed from chromosome 14 to the most
	proximal pericentromeric region of chromosome 22. Experimental and
	in silico data further support a pericentromeric gradient of duplications
	where the most recent duplications transpose adjacent to the centromere.
	Taken together, these data suggest that segmental duplications have
	been an ongoing process of primate genome evolution, contributing
	to recent gene innovation and the dynamic transformation of genome
	architecture within and among closely related species.},
  institution = {Department of Genetics and Center for Human Genetics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, OH, USA.},
  keywords = {Animals; Centromere, genetics; Chromosomes, Human, Pair 14, genetics;
	Chromosomes, Human, Pair 22, genetics; Evolution, Molecular; Exons,
	genetics; Gene Dosage; Gene Duplication; Genes, Duplicate, genetics;
	Humans; In Situ Hybridization, Fluorescence; Introns, genetics; Mosaicism,
	genetics; Primates, genetics; RNA, Messenger, analysis/genetics;
	Species Specificity; Time Factors; Transcription, Genetic, genetics;
	Translocation, Genetic, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0002-9297(07)61285-3},
  pmid = {11731936},
  timestamp = {2011.07.19},
}

% Bashir et al. (2008), PLoS Comput Biol, article e1000051. The title carries
% no trailing period (the bibliography style supplies punctuation); month uses
% the BibTeX macro.
@ARTICLE{Bashir2008,
  author = {Ali Bashir and Stanislav Volik and Colin Collins and Vineet Bafna
	and Benjamin J Raphael},
  title = {Evaluation of paired-end sequencing strategies for detection of genome
	rearrangements in cancer},
  journal = {PLoS Comput Biol},
  year = {2008},
  volume = {4},
  pages = {e1000051},
  month = apr,
  abstract = {Paired-end sequencing is emerging as a key technique for assessing
	genome rearrangements and structural variation on a genome-wide scale.
	This technique is particularly useful for detecting copy-neutral
	rearrangements, such as inversions and translocations, which are
	common in cancer and can produce novel fusion genes. We address the
	question of how much sequencing is required to detect rearrangement
	breakpoints and to localize them precisely using both theoretical
	models and simulation. We derive a formula for the probability that
	a fusion gene exists in a cancer genome given a collection of paired-end
	sequences from this genome. We use this formula to compute fusion
	gene probabilities in several breast cancer samples, and we find
	that we are able to accurately predict fusion genes in these samples
	with a relatively small number of fragments of large size. We further
	demonstrate how the ability to detect fusion genes depends on the
	distribution of gene lengths, and we evaluate how different parameters
	of a sequencing strategy impact breakpoint detection, breakpoint
	localization, and fusion gene detection, even in the presence of
	errors that suggest false rearrangements. These results will be useful
	in calibrating future cancer sequencing efforts, particularly large-scale
	studies of many cancer genomes that are enabled by next-generation
	sequencing technologies.},
  institution = {Bioinformatics Graduate Program, University of California San Diego,
	San Diego, California, United States of America. abashir@ucsd.edu},
  keywords = {Algorithms; Base Sequence; Breast Neoplasms; Chromosome Mapping; Female;
	Gene Rearrangement; Humans; Molecular Sequence Data; Sequence Analysis,
	DNA},
  owner = {calkan},
  pmid = {18404202},
  timestamp = {2009.01.12},
}

% Batzer and Deininger (2002), Nat Rev Genet. "Alu" is brace-protected against
% style re-casing; the title carries no trailing period and the month uses the
% BibTeX macro.
@ARTICLE{Batzer2002,
  author = {Mark A Batzer and Prescott L Deininger},
  title = {{Alu} repeats and human genomic diversity},
  journal = {Nat Rev Genet},
  year = {2002},
  volume = {3},
  pages = {370--379},
  month = may,
  abstract = {During the past 65 million years, Alu elements have propagated to
	more than one million copies in primate genomes, which has resulted
	in the generation of a series of Alu subfamilies of different ages.
	Alu elements affect the genome in several ways, causing insertion
	mutations, recombination between elements, gene conversion and alterations
	in gene expression. Alu-insertion polymorphisms are a boon for the
	study of human population genetics and primate comparative genomics
	because they are neutral genetic markers of identical descent with
	known ancestral states.},
  institution = {Department of Biological Sciences, Biological Computation and Visualization
	Center, Louisiana State University, 202 Life Sciences Building, Baton
	Rouge, Louisiana 70803, USA. mbatzer@lsu.edu},
  keywords = {Alu Elements, genetics; Base Sequence; Evolution, Molecular; Gene
	Conversion; Genetic Variation; Genome, Human; Humans; Molecular Sequence
	Data; Polymorphism, Genetic; Recombination, Genetic; Retroelements,
	genetics; Sequence Alignment},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nrg798},
  pmid = {11988762},
  timestamp = {2011.07.22},
}

@ARTICLE{Bekpen2005,
  author = {Cemalettin Bekpen and Julia P Hunn and Christoph Rohde and Iana Parvanova
	and Libby Guethlein and Diane M Dunn and Eva Glowalla and Maria Leptin
	and Jonathan C Howard},
  title = {The interferon-inducible p47 {(IRG) GTPases} in vertebrates: loss
	of the cell autonomous resistance mechanism in the human lineage},
  journal = {Genome Biol},
  year = {2005},
  volume = {6},
  pages = {R92},
  abstract = {BACKGROUND: Members of the p47 (immunity-related GTPases (IRG) family)
	GTPases are essential, interferon-inducible resistance factors in
	mice that are active against a broad spectrum of important intracellular
	pathogens. Surprisingly, there are no reports of p47 function in
	humans. RESULTS: Here we show that the p47 GTPases are represented
	by 23 genes in the mouse, whereas humans have only a single full-length
	p47 GTPase and an expressed, truncated presumed pseudo-gene. The
	human full-length gene is orthologous to an isolated mouse p47 GTPase
	that carries no interferon-inducible elements in the promoter of
	either species and is expressed constitutively in the mature testis
	of both species. Thus, there is no evidence for a p47 GTPase-based
	resistance system in humans. Dogs have several interferon-inducible
	p47s, and so the primate lineage that led to humans appears to have
	lost an ancient function. Multiple p47 GTPases are also present in
	the zebrafish, but there is only a tandem p47 gene pair in pufferfish.
	CONCLUSION: Mice and humans must deploy their immune resources against
	vacuolar pathogens in radically different ways. This carries significant
	implications for the use of the mouse as a model of human infectious
	disease. The absence of the p47 resistance system in humans suggests
	that possession of this resistance system carries significant costs
	that, in the primate lineage that led to humans, are not outweighed
	by the benefits. The origin of the vertebrate p47 system is obscure.},
  keywords = {Amino Acid Sequence; Animals; Dogs; Enhancer Elements (Genetics);
	Evolution, Molecular; GTP Phosphohydrolases; Genome, Human; Humans;
	Immunity, Natural; Interferons; Mice; Molecular Sequence Data; Multigene
	Family; Phylogeny; Promoter Regions (Genetics); Sequence Alignment;
	Sequence Analysis, {DNA}; Sequence Homology, Amino Acid; Synteny},
  owner = {calkan},
  pii = {gb-2005-6-11-r92},
  pmid = {16277747},
  timestamp = {2007.04.11},
}

@ARTICLE{Bekpen2009,
  author = {Cemalettin Bekpen and Tomas Marques-Bonet and Can Alkan and Francesca
	Antonacci and Maria Bruna Leogrande and Mario Ventura and Jeffrey
	M Kidd and Priscillia Siswara and Jonathan C Howard and Evan E Eichler},
  title = {Death and resurrection of the human {{\itshape IRGM}} gene},
  journal = {PLoS Genet},
  year = {2009},
  volume = {5},
  pages = {e1000403},
  month = mar,
  abstract = {Immunity-related GTPases (IRG) play an important role in defense against
	intracellular pathogens. One member of this gene family in humans,
	IRGM, has been recently implicated as a risk factor for Crohn's disease.
	We analyzed the detailed structure of this gene family among primates
	and showed that most of the IRG gene cluster was deleted early in
	primate evolution, after the divergence of the anthropoids from prosimians
	(about 50 million years ago). Comparative sequence analysis of New
	World and Old World monkey species shows that the single-copy IRGM
	gene became pseudogenized as a result of an Alu retrotransposition
	event in the anthropoid common ancestor that disrupted the open reading
	frame (ORF). We find that the ORF was reestablished as a part of
	a polymorphic stop codon in the common ancestor of humans and great
	apes. Expression analysis suggests that this change occurred in conjunction
	with the insertion of an endogenous retrovirus, which altered the
	transcription initiation, splicing, and expression profile of IRGM.
	These data argue that the gene became pseudogenized and was then
	resurrected through a series of complex structural events and suggest
	remarkable functional plasticity where alleles experience diverse
	evolutionary pressures over time. Such dynamism in structure and
	evolution may be critical for a gene family locked in an arms race
	with an ever-changing repertoire of intracellular parasites.},
  file = {main:Bekpen2009.pdf:PDF;supp_note:Bekpen2009-supp.pdf:PDF;figS1:Bekpen2009-figS1.pdf:PDF;figS2:Bekpen2009-figS2.pdf:PDF;figS3:Bekpen2009-figS3.pdf:PDF;figS4:Bekpen2009-figS4.pdf:PDF;figS5:Bekpen2009-figS5.pdf:PDF;figS6:Bekpen2009-figS6.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington, Seattle,
	Washington, United States of America.},
  keywords = {Animals; Evolution, Molecular; GTP-Binding Proteins; Gene Expression;
	Humans; Multigene Family; Mutagenesis, Insertional; Phylogeny; Primates;
	Pseudogenes; Retroelements},
  owner = {calkan},
  pmid = {19266026},
  timestamp = {2009.09.18},
}

% Hand-entered duplicate of Bentley2006 (same paper); key kept so existing
% citations of "illumina" still resolve.
@ARTICLE{illumina,
  author = {David R Bentley},
  title = {Whole-genome re-sequencing},
  journal = {Current Opinion in Genetics \& Development},
  year = {2006},
  volume = {16},
  pages = {545--552},
  month = dec
}

@ARTICLE{Bentley2006,
  author = {David R Bentley},
  title = {Whole-genome re-sequencing},
  journal = {Curr Opin Genet Dev},
  year = {2006},
  volume = {16},
  pages = {545--552},
  month = dec,
  abstract = {{DNA} sequencing can be used to gain important information on genes,
	genetic variation and gene function for biological and medical studies.
	The growing collection of publicly available reference genome sequences
	will underpin a new era of whole genome re-sequencing, but sequencing
	costs need to fall and throughput needs to rise by several orders
	of magnitude. Novel technologies are being developed to meet this
	need by generating massive amounts of sequence that can be aligned
	to the reference sequence. The challenge is to maintain the high
	standards of accuracy and completeness that are hallmarks of the
	previous genome projects. One or more new sequencing technologies
	are expected to become the mainstay of future research, and to make
	{DNA} sequencing centre stage as a routine tool in genetic research
	in the coming years.},
  keywords = {Genome, Human; Genomics; Humans; Sequence Analysis, {DNA}},
  owner = {calkan},
  pii = {S0959-437X(06)00208-5},
  pmid = {17055251},
  timestamp = {2007.05.08},
}

@ARTICLE{Bentley2008,
  author = {David R Bentley and Shankar Balasubramanian and Harold P Swerdlow
	and Geoffrey P Smith and John Milton and Clive G Brown and Kevin
	P Hall and Dirk J Evers and Colin L Barnes and Helen R Bignell and
	Jonathan M Boutell and Jason Bryant and Richard J Carter and R. Keira
	Cheetham and Anthony J Cox and Darren J Ellis and Michael R Flatbush
	and Niall A Gormley and Sean J Humphray and Leslie J Irving and Mirian
	S Karbelashvili and Scott M Kirk and Heng Li and Xiaohai Liu and
	Klaus S Maisinger and Lisa J Murray and Bojan Obradovic and Tobias
	Ost and Michael L Parkinson and Mark R Pratt and Isabelle M J Rasolonjatovo
	and Mark T Reed and Roberto Rigatti and Chiara Rodighiero and Mark
	T Ross and Andrea Sabot and Subramanian V Sankar and Aylwyn Scally
	and Gary P Schroth and Mark E Smith and Vincent P Smith and Anastassia
	Spiridou and Peta E Torrance and Svilen S Tzonev and Eric H Vermaas
	and Klaudia Walter and Xiaolin Wu and Lu Zhang and Mohammed D Alam
	and Carole Anastasi and Ify C Aniebo and David M D Bailey and Iain
	R Bancarz and Saibal Banerjee and Selena G Barbour and Primo A Baybayan
	and Vincent A Benoit and Kevin F Benson and Claire Bevis and Phillip
	J Black and Asha Boodhun and Joe S Brennan and John A Bridgham and
	Rob C Brown and Andrew A Brown and Dale H Buermann and Abass A Bundu
	and James C Burrows and Nigel P Carter and Nestor Castillo and Maria
	Chiara E Catenazzi and Simon Chang and R. Neil Cooley and Natasha
	R Crake and Olubunmi O Dada and Konstantinos D Diakoumakos and Belen
	Dominguez-Fernandez and David J Earnshaw and Ugonna C Egbujor and
	David W Elmore and Sergey S Etchin and Mark R Ewan and Milan Fedurco
	and Louise J Fraser and Karin V {Fuentes Fajardo} and W. Scott Furey
	and David George and Kimberley J Gietzen and Colin P Goddard and
	George S Golda and Philip A Granieri and David E Green and David
	L Gustafson and Nancy F Hansen and Kevin Harnish and Christian D
	Haudenschild and Narinder I Heyer and Matthew M Hims and Johnny T
	Ho and Adrian M Horgan and Katya Hoschler and Steve Hurwitz and Denis
	V Ivanov and Maria Q Johnson and Terena James and T. A. Huw Jones
	and Gyoung-Dong Kang and Tzvetana H Kerelska and Alan D Kersey and
	Irina Khrebtukova and Alex P Kindwall and Zoya Kingsbury and Paula
	I Kokko-Gonzales and Anil Kumar and Marc A Laurent and Cynthia T
	Lawley and Sarah E Lee and Xavier Lee and Arnold K Liao and Jennifer
	A Loch and Mitch Lok and Shujun Luo and Radhika M Mammen and John
	W Martin and Patrick G McCauley and Paul McNitt and Parul Mehta and
	Keith W Moon and Joe W Mullens and Taksina Newington and Zemin Ning
	and Bee Ling Ng and Sonia M Novo and Michael J O'Neill and Mark A
	Osborne and Andrew Osnowski and Omead Ostadan and Lambros L Paraschos
	and Lea Pickering and Andrew C Pike and Alger C Pike and D. Chris
	Pinkard and Daniel P Pliskin and Joe Podhasky and Victor J Quijano
	and Come Raczy and Vicki H Rae and Stephen R Rawlings and Ana {Chiva Rodriguez}
	and Phyllida M Roe and John Rogers and Maria C {Rogert Bacigalupo}
	and Nikolai Romanov and Anthony Romieu and Rithy K Roth and Natalie
	J Rourke and Silke T Ruediger and Eli Rusman and Raquel M Sanches-Kuiper
	and Martin R Schenker and Josefina M Seoane and Richard J Shaw and
	Mitch K Shiver and Steven W Short and Ning L Sizto and Johannes P
	Sluis and Melanie A Smith and Jean Ernest {Sohna Sohna} and Eric J
	Spence and Kim Stevens and Neil Sutton and Lukasz Szajkowski and
	Carolyn L Tregidgo and Gerardo Turcatti and Stephanie Vandevondele
	and Yuli Verhovsky and Selene M Virk and Suzanne Wakelin and Gregory
	C Walcott and Jingwen Wang and Graham J Worsley and Juying Yan and
	Ling Yau and Mike Zuerlein and Jane Rogers and James C Mullikin and
	Matthew E Hurles and Nick J McCooke and John S West and Frank L Oaks
	and Peter L Lundberg and David Klenerman and Richard Durbin and Anthony
	J Smith},
  title = {Accurate whole human genome sequencing using reversible terminator
	chemistry},
  journal = {Nature},
  year = {2008},
  volume = {456},
  pages = {53--59},
  month = nov,
  abstract = {DNA sequence information underpins genetic research, enabling discoveries
	of important biological or medical benefit. Sequencing projects have
	traditionally used long (400-800 base pair) reads, but the existence
	of reference sequences for the human and many other genomes makes
	it possible to develop new, fast approaches to re-sequencing, whereby
	shorter reads are compared to a reference to identify intraspecies
	genetic variation. Here we report an approach that generates several
	billion bases of accurate nucleotide sequence per experiment at low
	cost. Single molecules of DNA are attached to a flat surface, amplified
	in situ and used as templates for synthetic sequencing with fluorescent
	reversible terminator deoxyribonucleotides. Images of the surface
	are analysed to generate high-quality sequence. We demonstrate application
	of this approach to human genome sequencing on flow-sorted X chromosomes
	and then scale the approach to determine the genome sequence of a
	male Yoruba from Ibadan, Nigeria. We build an accurate consensus
	sequence from >30x average depth of paired 35-base reads. We characterize
	four million single-nucleotide polymorphisms and four hundred thousand
	structural variants, many of which were previously unknown. Our approach
	is effective for accurate, rapid and economical whole-genome re-sequencing
	and many other biomedical applications.},
  institution = {Illumina Cambridge Ltd. (Formerly Solexa Ltd), Chesterford Research
	Park, Little Chesterford, Nr Saffron Walden, Essex CB10 1XL, UK.
	dbentley@illumina.com},
  keywords = {Chromosomes, Human, X; Consensus Sequence; Genome, Human; Genomics;
	Genotype; Humans; Male; Nigeria; Polymorphism, Single Nucleotide;
	Sensitivity and Specificity; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {nature07517},
  pmid = {18987734},
  timestamp = {2009.01.12},
}

@ARTICLE{saruman,
  author = {Jochen Blom and Tobias Jakobi and Daniel Doppmeier and Sebastian
	Jaenicke and J{\"o}rn Kalinowski and Jens Stoye and Alexander Goesmann},
  title = {Exact and complete short-read alignment to microbial genomes using
	Graphics Processing Unit programming},
  journal = {Bioinformatics},
  year = {2011},
  volume = {27},
  number = {10},
  pages = {1351--1358}
}

@ARTICLE{Branton2008,
  author = {Daniel Branton and David W Deamer and Andre Marziali and Hagan Bayley
	and Steven A Benner and Thomas Butler and Massimiliano {Di Ventra}
	and Slaven Garaj and Andrew Hibbs and Xiaohua Huang and Stevan B
	Jovanovich and Predrag S Krstic and Stuart Lindsay and Xinsheng Sean
	Ling and Carlos H Mastrangelo and Amit Meller and John S Oliver and
	Yuriy V Pershin and J. Michael Ramsey and Robert Riehn and Gautam
	V Soni and Vincent Tabard-Cossa and Meni Wanunu and Matthew Wiggin
	and Jeffery A Schloss},
  title = {The potential and challenges of nanopore sequencing},
  journal = {Nat Biotechnol},
  year = {2008},
  volume = {26},
  pages = {1146--1153},
  month = oct,
  abstract = {A nanopore-based device provides single-molecule detection and analytical
	capabilities that are achieved by electrophoretically driving molecules
	in solution through a nano-scale pore. The nanopore provides a highly
	confined space within which single nucleic acid polymers can be analyzed
	at high throughput by one of a variety of means, and the perfect
	processivity that can be enforced in a narrow pore ensures that the
	native order of the nucleobases in a polynucleotide is reflected
	in the sequence of signals that is detected. Kilobase length polymers
	(single-stranded genomic DNA or RNA) or small molecules (e.g., nucleosides)
	can be identified and characterized without amplification or labeling,
	a unique analytical capability that makes inexpensive, rapid DNA
	sequencing a possibility. Further research and development to overcome
	current challenges to nanopore identification of each successive
	nucleotide in a DNA strand offers the prospect of 'third generation'
	instruments that will sequence a diploid mammalian genome for approximately
	\$1,000 in approximately 24 h.},
  institution = {Department of Molecular and Cell Biology, Harvard University, Cambridge,
	Massachusetts 02138, USA. dbranton@harvard.edu},
  keywords = {Chromosome Mapping; DNA; Forecasting; Genomics; Nanostructures; Nanotechnology;
	Sequence Alignment; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {nbt.1495},
  pmid = {18846088},
  timestamp = {2009.04.17},
}

@TECHREPORT{Burrows94ablock-sorting,
  author = {M. Burrows and D. J. Wheeler},
  title = {A block-sorting lossless data compression algorithm},
  institution = {Digital Equipment Corporation, Systems Research Center},
  number = {124},
  address = {Palo Alto, CA},
  year = {1994},
  month = may
}

% Hand-entered duplicate of Campbell2008 (same paper); key kept so existing
% citations of "genome_sequence_2" still resolve.
@ARTICLE{genome_sequence_2,
  author = {Campbell, Peter J. and others},
  title = {Identification of somatically acquired rearrangements in cancer using
	genome-wide massively parallel paired-end sequencing},
  journal = {Nat. Genet.},
  year = {2008},
  volume = {40},
  pages = {722--729},
  month = jun
}

@ARTICLE{Campbell2008,
  author = {Peter J Campbell and Philip J Stephens and Erin D Pleasance and Sarah
	O'Meara and Heng Li and Thomas Santarius and Lucy A Stebbings and
	Catherine Leroy and Sarah Edkins and Claire Hardy and Jon W Teague
	and Andrew Menzies and Ian Goodhead and Daniel J Turner and Christopher
	M Clee and Michael A Quail and Antony Cox and Clive Brown and Richard
	Durbin and Matthew E Hurles and Paul A W Edwards and Graham R Bignell
	and Michael R Stratton and P. Andrew Futreal},
  title = {Identification of somatically acquired rearrangements in cancer using
	genome-wide massively parallel paired-end sequencing},
  journal = {Nat Genet},
  year = {2008},
  volume = {40},
  pages = {722--729},
  month = jun,
  abstract = {Human cancers often carry many somatically acquired genomic rearrangements,
	some of which may be implicated in cancer development. However, conventional
	strategies for characterizing rearrangements are laborious and low-throughput
	and have low sensitivity or poor resolution. We used massively parallel
	sequencing to generate sequence reads from both ends of short DNA
	fragments derived from the genomes of two individuals with lung cancer.
	By investigating read pairs that did not align correctly with respect
	to each other on the reference human genome, we characterized 306
	germline structural variants and 103 somatic rearrangements to the
	base-pair level of resolution. The patterns of germline and somatic
	rearrangement were markedly different. Many somatic rearrangements
	were from amplicons, although rearrangements outside these regions,
	notably including tandem duplications, were also observed. Some somatic
	rearrangements led to abnormal transcripts, including two from internal
	tandem duplications and two fusion transcripts created by interchromosomal
	rearrangements. Germline variants were predominantly mediated by
	retrotransposition, often involving AluY and LINE elements. The results
	demonstrate the feasibility of systematic, genome-wide characterization
	of rearrangements in complex human cancer genomes, raising the prospect
	of a new harvest of genes associated with cancer using this strategy.},
  institution = {Wellcome Trust Sanger Institute, Hinxton CB10 1SA, UK.},
  keywords = {Base Pairing; Chromosome Mapping; Computational Biology; Gene Dosage;
	Gene Rearrangement; Genome, Human; Humans; Lung Neoplasms; RNA, Messenger;
	Repetitive Sequences, Nucleic Acid; Reverse Transcriptase Polymerase
	Chain Reaction; Sequence Analysis, DNA; Variation (Genetics)},
  owner = {calkan},
  pii = {ng.128},
  pmid = {18438408},
  timestamp = {2008.10.01},
}

@ARTICLE{Cellamare2009,
  author = {A. Cellamare and C. R. Catacchio and C. Alkan and G. Giannuzzi and
	F. Antonacci and M. F. Cardone and G. {Della Valle} and M. Malig and
	M. Rocchi and E. E. Eichler and M. Ventura},
  title = {New insights into centromere organization and evolution from the
	white-cheeked gibbon and marmoset},
  journal = {Mol Biol Evol},
  year = {2009},
  volume = {26},
  pages = {1889--1900},
  month = aug,
  abstract = {The evolutionary history of alpha-satellite DNA, the major component
	of primate centromeres, is hardly defined because of the difficulty
	in its sequence assembly and its rapid evolution when compared with
	most genomic sequences. By using several approaches, we have cloned,
	sequenced, and characterized alpha-satellite sequences from two species
	representing critical nodes in the primate phylogeny: the white-cheeked
	gibbon, a lesser ape, and marmoset, a New World monkey. Sequence
	analyses demonstrate that white-cheeked gibbon and marmoset alpha-satellite
	sequences are formed by units of approximately 171 and approximately
	342 bp, respectively, and they both lack the high-order structure
	found in humans and great apes. Fluorescent in situ hybridization
	characterization shows a broad dispersal of alpha-satellite in the
	white-cheeked gibbon genome including centromeric, telomeric, and
	chromosomal interstitial localizations. On the other hand, centromeres
	in marmoset appear organized in highly divergent dimers roughly of
	342 bp that show a similarity between monomers much lower than previously
	reported dimers, thus representing an ancient dimeric structure.
	All these data shed light on the evolution of the centromeric sequences
	in Primates. Our results suggest radical differences in the structure,
	organization, and evolution of alpha-satellite DNA among different
	primate species, supporting the notion that 1) all the centromeric
	sequence in Primates evolved by genomic amplification, unequal crossover,
	and sequence homogenization using a 171 bp monomer as the basic seeding
	unit and 2) centromeric function is linked to relatively short repeated
	elements, more than higher-order structure. Moreover, our data indicate
	that complex higher-order repeat structures are a peculiarity of
	the hominid lineage, showing the more complex organization in humans.},
  file = {main:Cellamare2009.pdf:PDF;figS1:Cellamare2009-figS1.jpg:JPG image;figS2:Cellamare2009-figS2.ppt:PowerPoint;supp_tabs:Cellamare2009-supp.xls:Excel},
  institution = {Department of Genetics and Microbiology, University of Bari, Bari,
	Italy.},
  keywords = {Animals; Callithrix, genetics; Cell Line; Centromere, genetics; Evolution;
	Humans; Hylobates, genetics; Primates, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {msp101},
  pmid = {19429672},
  timestamp = {2010.09.15},
}

@ARTICLE{Chaisson2004,
  author = {Mark Chaisson and Pavel Pevzner and Haixu Tang},
  title = {Fragment assembly with short reads},
  journal = {Bioinformatics},
  year = {2004},
  volume = {20},
  pages = {2067--2074},
  month = sep,
  abstract = {MOTIVATION: Current DNA sequencing technology produces reads of about
	500-750 bp, with typical coverage under 10x. New sequencing technologies
	are emerging that produce shorter reads (length 80-200 bp) but allow
	one to generate significantly higher coverage (30x and higher) at
	low cost. Modern assembly programs and error correction routines
	have been tuned to work well with current read technology but were
	not designed for assembly of short reads. RESULTS: We analyze the
	limitations of assembling reads generated by these new technologies
	and present a routine for base-calling in reads prior to their assembly.
	We demonstrate that while it is feasible to assemble such short reads,
	the resulting contigs will require significant (if not prohibitive)
	finishing efforts. AVAILABILITY: Available from the web at http://www.cse.ucsd.edu/groups/bioinformatics/software.html},
  file = {main:Chaisson2004.pdf:PDF},
  keywords = {Algorithms; Base Sequence; Contig Mapping; Feasibility Studies; Gene
	Expression Profiling; Molecular Sequence Data; Sequence Alignment;
	Sequence Analysis, DNA},
  owner = {calkan},
  pii = {bth205},
  pmid = {15059830},
  timestamp = {2008.03.02},
}

@ARTICLE{Chaisson2009,
  author = {Mark J Chaisson and Dumitru Brinza and Pavel A Pevzner},
  title = {De novo fragment assembly with short mate-paired reads: Does the
	read length matter?},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {336--346},
  month = feb,
  abstract = {Increasing read length is currently viewed as the crucial condition
	for fragment assembly with next-generation sequencing technologies.
	However, introducing mate-paired reads (separated by a gap of length,
	GapLength) opens a possibility to transform short mate-pairs into
	long mate-reads of length approximately GapLength, and thus raises
	the question as to whether the read length (as opposed to GapLength)
	even matters. We describe a new tool, EULER-USR, for assembling mate-paired
	short reads and use it to analyze the question of whether the read
	length matters. We further complement the ongoing experimental efforts
	to maximize read length by a new computational approach for increasing
	the effective read length. While the common practice is to trim the
	error-prone tails of the reads, we present an approach that substitutes
	trimming with error correction using repeat graphs. An important
	and counterintuitive implication of this result is that one may extend
	sequencing reactions that degrade with length "past their prime"
	to where the error rate grows above what is normally acceptable for
	fragment assembly.},
  file = {main:Chaisson2009.pdf:PDF},
  institution = {Bioinformatics Program, University of California San Diego, La Jolla,
	California 92093, USA. mchaisso@bioinf.ucsd.edu},
  keywords = {Algorithms; Base Pairing; Base Sequence; Bias (Epidemiology); Chromosome
	Mapping; Computational Biology; Escherichia coli; Genetic Markers;
	Genome, Bacterial; Models, Biological; Sequence Alignment; Sequence
	Analysis, DNA},
  owner = {calkan},
  pii = {gr.079053.108},
  pmid = {19056694},
  timestamp = {2009.04.12},
}

@ARTICLE{Chaisson2008,
  author = {Mark J Chaisson and Pavel A Pevzner},
  title = {Short read fragment assembly of bacterial genomes},
  journal = {Genome Res},
  year = {2008},
  volume = {18},
  pages = {324--330},
  month = feb,
  abstract = {In the last year, high-throughput sequencing technologies have progressed
	from proof-of-concept to production quality. While these methods
	produce high-quality reads, they have yet to produce reads comparable
	in length to Sanger-based sequencing. Current fragment assembly algorithms
	have been implemented and optimized for mate-paired Sanger-based
	reads, and thus do not perform well on short reads produced by short
	read technologies. We present a new Eulerian assembler that generates
	nearly optimal short read assemblies of bacterial genomes and describe
	an approach to assemble reads in the case of the popular hybrid protocol
	when short and long Sanger-based reads are combined.},
  file = {main:Chaisson2008.pdf:PDF},
  owner = {calkan},
  pii = {gr.7088808},
  pmid = {18083777},
  timestamp = {2008.03.02},
}

@ARTICLE{Chen2007a,
  author = {Ken Chen and Michael D McLellan and Li Ding and Michael C Wendl and
	Yumi Kasai and Richard K Wilson and Elaine R Mardis},
  title = {{PolyScan}: an automatic indel and {SNP} detection approach to the
	analysis of human resequencing data},
  journal = {Genome Res},
  year = {2007},
  volume = {17},
  pages = {659--666},
  month = may,
  abstract = {Small insertions and deletions (indels) and single nucleotide polymorphisms
	(SNPs) are common genetic variants that are thought to be associated
	with a wide variety of human diseases. Owing to the genome's size
	and complexity, manually characterizing each one of these variations
	in an individual is not practical. While significant progress has
	been made in automated single-base mutation discovery from the sequences
	of diploid PCR products, automated and reliable detection of indels
	continues to pose difficult challenges. In this paper, we present
	PolyScan, an algorithm and software implementation designed to provide
	de novo heterozygous indel detection and improved SNP identification
	in the context of high-throughput medical resequencing. Tests on
	a human diploid PCR-based sequence data set, consisting of 90,270
	traces from 13 genes, indicate that PolyScan identified approximately
	90\% of the 151 consensus indel sites and approximately 84\% of the
	1546 heterozygous indels previously identified by manual inspection.
	Tests on tumor-derived data show that PolyScan better identifies
	high-quality, low-level mutations as compared with other mutation
	detection software. Moreover, SNP identification improves when reprocessing
	the results of other programs. These results suggest that PolyScan
	may play a useful role in the post human genome project research
	era.},
  institution = {Genome Sequencing Center, Washington University School of Medicine,
	St. Louis, Missouri 63108, USA. kchen22@wustl.edu},
  keywords = {Algorithms; Base Sequence; Humans; Molecular Sequence Data; Polymorphism,
	Single Nucleotide; Sequence Alignment; Sequence Analysis, DNA; Sequence
	Deletion; Software},
  owner = {calkan},
  pii = {gr.6151507},
  pmid = {17416743},
  timestamp = {2009.09.22},
}

@ARTICLE{Chen2009,
  author = {Ken Chen and John W Wallis and Michael D McLellan and David E Larson
	and Joelle M Kalicki and Craig S Pohl and Sean D McGrath and Michael
	C Wendl and Qunyuan Zhang and Devin P Locke and Xiaoqi Shi and Robert
	S Fulton and Timothy J Ley and Richard K Wilson and Li Ding and Elaine
	R Mardis},
  title = {{BreakDancer}: an algorithm for high-resolution mapping of genomic
	structural variation},
  journal = {Nat Methods},
  year = {2009},
  volume = {6},
  pages = {677--681},
  month = sep,
  abstract = {Detection and characterization of genomic structural variation are
	important for understanding the landscape of genetic variation in
	human populations and in complex diseases such as cancer. Recent
	studies demonstrate the feasibility of detecting structural variation
	using next-generation, short-insert, paired-end sequencing reads.
	However, the utility of these reads is not entirely clear, nor are
	the analysis methods with which accurate detection can be achieved.
	The algorithm BreakDancer predicts a wide variety of structural variants
	including insertion-deletions (indels), inversions and translocations.
	We examined BreakDancer's performance in simulation, in comparison
	with other methods and in analyses of a sample from an individual
	with acute myeloid leukemia and of samples from the 1,000 Genomes
	trio individuals. BreakDancer sensitively and accurately detected
	indels ranging from 10 base pairs to 1 megabase pair that are difficult
	to detect via a single conventional approach.},
  file = {main:Chen2009.pdf:PDF},
  institution = {The Genome Center, Washington University School of Medicine, St.
	Louis, Missouri, USA. kchen22@wustl.edu},
  keywords = {Algorithms; Base Sequence; Computer Simulation; DNA; Genetic Variation;
	Genome, Human; Genomics; Humans; Leukemia, Myeloid, Acute; Sequence
	Analysis, DNA},
  owner = {calkan},
  pii = {nmeth.1363},
  pmid = {19668202},
  timestamp = {2009.09.22},
}

% Hand-entered duplicate of Chiang2009 (same paper); key kept so existing
% citations of "genome_sequence_4" still resolve.
@ARTICLE{genome_sequence_4,
  author = {Chiang, Derek Y. and others},
  title = {High-resolution mapping of copy-number alterations with massively
	parallel sequencing},
  journal = {Nature Methods},
  year = {2009},
  volume = {6},
  pages = {99--103},
  month = jan
}

@ARTICLE{Chiang2009,
  author = {Derek Y Chiang and Gad Getz and David B Jaffe and Michael J T O'Kelly
	and Xiaojun Zhao and Scott L Carter and Carsten Russ and Chad Nusbaum
	and Matthew Meyerson and Eric S Lander},
  title = {High-resolution mapping of copy-number alterations with massively
	parallel sequencing},
  journal = {Nat Methods},
  year = {2009},
  volume = {6},
  pages = {99--103},
  month = jan,
  abstract = {Cancer results from somatic alterations in key genes, including point
	mutations, copy-number alterations and structural rearrangements.
	A powerful way to discover cancer-causing genes is to identify genomic
	regions that show recurrent copy-number alterations (gains and losses)
	in tumor genomes. Recent advances in sequencing technologies suggest
	that massively parallel sequencing may provide a feasible alternative
	to DNA microarrays for detecting copy-number alterations. Here we
	present: (i) a statistical analysis of the power to detect copy-number
	alterations of a given size; (ii) SegSeq, an algorithm to segment
	equal copy numbers from massively parallel sequence data; and (iii)
	analysis of experimental data from three matched pairs of tumor and
	normal cell lines. We show that a collection of approximately 14
	million aligned sequence reads from human cell lines has comparable
	power to detect events as the current generation of DNA microarrays
	and has over twofold better precision for localizing breakpoints
	(typically, to within approximately 1 kilobase).},
  institution = {Broad Institute, Massachusetts Institute of Technology, 7 Cambridge
	Center, Cambridge, MA 02142, USA.},
  keywords = {Algorithms; Base Sequence; Cell Line, Tumor; Chromosomes, Human; Databases,
	Genetic; Gene Dosage; Humans},
  owner = {calkan},
  pii = {nmeth.1276},
  pmid = {19043412},
  timestamp = {2009.09.22},
}

@ARTICLE{Clarke2009,
  author = {James Clarke and Hai-Chen Wu and Lakmal Jayasinghe and Alpesh Patel
	and Stuart Reid and Hagan Bayley},
  title = {Continuous base identification for single-molecule nanopore {DNA}
	sequencing},
  journal = {Nat Nanotechnol},
  year = {2009},
  volume = {4},
  pages = {265--270},
  month = {Apr},
  abstract = {A single-molecule method for sequencing DNA that does not require
	fluorescent labelling could reduce costs and increase sequencing
	speeds. An exonuclease enzyme might be used to cleave individual
	nucleotide molecules from the DNA, and when coupled to an appropriate
	detection system, these nucleotides could be identified in the correct
	order. Here, we show that a protein nanopore with a covalently attached
	adapter molecule can continuously identify unlabelled nucleoside
	5'-monophosphate molecules with accuracies averaging 99.8\%. Methylated
	cytosine can also be distinguished from the four standard DNA bases:
	guanine, adenine, thymine and cytosine. The operating conditions
	are compatible with the exonuclease, and the kinetic data show that
	the nucleotides have a high probability of translocation through
	the nanopore and, therefore, of not being registered twice. This
	highly accurate tool is suitable for integration into a system for
	sequencing nucleic acids and for analysing epigenetic modifications.},
  
  institution = {Oxford Nanopore Technologies Ltd, Begbroke Science Park, Sandy Lane,
	Oxford OX5 1PF, UK.},
  owner = {calkan},
  pii = {nnano.2009.12},
  pmid = {19350039},
  timestamp = {2009.04.12},
}

@ARTICLE{Conrad2010,
  author = {Donald F Conrad and Dalila Pinto and Richard Redon and Lars Feuk
	and Omer Gokcumen and Yujun Zhang and Jan Aerts and T. Daniel Andrews
	and Chris Barnes and Peter Campbell and Tomas Fitzgerald and Min
	Hu and Chun Hwa Ihm and Kati Kristiansson and Daniel G Macarthur
	and Jeffrey R Macdonald and Ifejinelo Onyiah and Andy Wing Chun Pang
	and Sam Robson and Kathy Stirrups and Armand Valsesia and Klaudia
	Walter and John Wei and {Wellcome Trust Case Control Consortium} and
	Chris Tyler-Smith and Nigel P Carter and Charles Lee and Stephen
	W Scherer and Matthew E Hurles},
  title = {Origins and functional impact of copy number variation in the human
	genome},
  journal = {Nature},
  year = {2010},
  volume = {464},
  pages = {704--712},
  month = {Apr},
  abstract = {Structural variations of DNA greater than 1 kilobase in size account
	for most bases that vary among human genomes, but are still relatively
	under-ascertained. Here we use tiling oligonucleotide microarrays,
	comprising 42 million probes, to generate a comprehensive map of
	11,700 copy number variations (CNVs) greater than 443 base pairs,
	of which most (8,599) have been validated independently. For 4,978
	of these CNVs, we generated reference genotypes from 450 individuals
	of European, African or East Asian ancestry. The predominant mutational
	mechanisms differ among CNV size classes. Retrotransposition has
	duplicated and inserted some coding and non-coding DNA segments randomly
	around the genome. Furthermore, by correlation with known trait-associated
	single nucleotide polymorphisms (SNPs), we identified 30 loci with
	CNVs that are candidates for influencing disease susceptibility.
	Despite this, having assessed the completeness of our map and the
	patterns of linkage disequilibrium between CNVs and SNPs, we conclude
	that, for complex traits, the heritability void left by genome-wide
	association studies will not be accounted for by common CNVs.},
  
  institution = {The Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus,
	Hinxton, Cambridge, CB10 1SA UK.},
  keywords = {Continental Population Groups, genetics; DNA Copy Number Variations,
	genetics; Gene Duplication; Genetic Predisposition to Disease, genetics;
	Genome, Human, genetics; Genome-Wide Association Study; Genotype;
	Haplotypes, genetics; Humans; Mutagenesis, genetics; Oligonucleotide
	Array Sequence Analysis; Polymorphism, Single Nucleotide, genetics;
	Reproducibility of Results},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature08516},
  pmid = {19812545},
  timestamp = {2011.07.19},
}

@ARTICLE{Cooper2007,
  author = {Gregory M Cooper and Deborah A Nickerson and Evan E Eichler},
  title = {Mutational and selective effects on copy-number variants in the human
	genome},
  journal = {Nat Genet},
  year = {2007},
  volume = {39},
  pages = {S22--S29},
  month = {Jul},
  abstract = {Comprehensive descriptions of large insertion/deletion or segmental
	duplication polymorphisms (SDs) in the human genome have recently
	been generated. These annotations, known collectively as structural
	or copy-number variants (CNVs), include thousands of discrete genomic
	regions and span hundreds of millions of nucleotides. Here we review
	the genomic distribution of CNVs, which is strongly correlated with
	gene, repeat and segmental duplication content. We explore the evolutionary
	mechanisms giving rise to this nonrandom distribution, considering
	the available data on both human polymorphisms and the fixed changes
	that differentiate humans from other species. It is likely that mutational
	biases, selective effects and interactions between these forces all
	contribute substantially to the spectrum of human copy-number variation.
	Although defining these variants with nucleotide-level precision
	remains a largely unmet but critical challenge, our understanding
	of their potential medical impact and evolutionary importance is
	rapidly emerging.},
  
  keywords = {Animals; Evolution; Gene Dosage; Gene Duplication; Genome, Human;
	Humans; Models, Genetic; Mutation; Pan troglodytes; Selection (Genetics);
	Species Specificity; Variation (Genetics)},
  owner = {calkan},
  pii = {ng2054},
  pmid = {17597777},
  timestamp = {2008.03.02},
}

@ARTICLE{Cooper2008,
  author = {Gregory M Cooper and Troy Zerr and Jeffrey M Kidd and Evan E Eichler
	and Deborah A Nickerson},
  title = {Systematic assessment of copy number variant detection via genome-wide
	{SNP} genotyping},
  journal = {Nat Genet},
  year = {2008},
  volume = {40},
  pages = {1199--1203},
  month = {Oct},
  abstract = {SNP genotyping has emerged as a technology to incorporate copy number
	variants (CNVs) into genetic analyses of human traits. However, the
	extent to which SNP platforms accurately capture CNVs remains unclear.
	Using independent, sequence-based CNV maps, we find that commonly
	used SNP platforms have limited or no probe coverage for a large
	fraction of CNVs. Despite this, in 9 samples we inferred 368 CNVs
	using Illumina SNP genotyping data and experimentally validated over
	two-thirds of these. We also developed a method (SNP-Conditional
	Mixture Modeling, SCIMM) to robustly genotype deletions using as
	few as two SNP probes. We find that HapMap SNPs are strongly correlated
	with 82\% of common deletions, but the newest SNP platforms effectively
	tag about 50\%. We conclude that currently available genome-wide
	SNP assays can capture CNVs accurately, but improvements in array
	designs, particularly in duplicated sequences, are necessary to facilitate
	more comprehensive analyses of genomic variation.},
  
  institution = {Department of Genome Sciences, University of Washington, Seattle,
	Washington 98195, USA. coopergm@u.washington.edu},
  owner = {calkan},
  pii = {ng.236},
  pmid = {18776910},
  timestamp = {2008.09.29},
}

@ARTICLE{Day2007,
  author = {Nathan Day and Andrew Hemmaplardh and Robert E Thurman and John A
	Stamatoyannopoulos and William S Noble},
  title = {Unsupervised segmentation of continuous genomic data},
  journal = {Bioinformatics},
  year = {2007},
  volume = {23},
  pages = {1424--1426},
  month = {Jun},
  abstract = {The advent of high-density, high-volume genomic data has created the
	need for tools to summarize large datasets at multiple scales. HMMSeg
	is a command-line utility for the scale-specific segmentation of
	continuous genomic data using hidden Markov models (HMMs). Scale
	specificity is achieved by an optional wavelet-based smoothing operation.
	HMMSeg is capable of handling multiple datasets simultaneously, rendering
	it ideal for integrative analysis of expression, phylogenetic and
	functional genomic data. AVAILABILITY: http://noble.gs.washington.edu/proj/hmmseg},
  
  file = {main:Day2007.pdf:PDF},
  institution = {Department of Computer Science and Engineering, University of Washington,
	Seattle, WA, USA.},
  keywords = {Algorithms; Artificial Intelligence; Chromosome Mapping; Computer
	Simulation; Databases, Genetic; Information Storage and Retrieval;
	Markov Chains; Models, Genetic; Models, Statistical; Pattern Recognition,
	Automated; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {btm096},
  pmid = {17384021},
  timestamp = {2009.09.21},
}

@ARTICLE{Dew2005,
  author = {Ian M Dew and Brian Walenz and Granger Sutton},
  title = {A tool for analyzing mate pairs in assemblies ({TAMPA})},
  journal = {J Comput Biol},
  year = {2005},
  volume = {12},
  pages = {497--513},
  month = {Jun},
  abstract = {The current generation of genome assembly programs uses distance and
	orientation relationships of paired end reads of clones (mate pairs)
	to order and orient contigs. Mate pair data can also be used to evaluate
	and compare assemblies after the fact. Earlier work employed a simple
	heuristic to detect assembly problems by scanning across an assembly
	to locate peak concentrations of unsatisfied mate pairs. TAMPA is
	a novel, computational geometry-based approach to detecting assembly
	breakpoints by exploiting constraints that mate pairs impose on each
	other. The method can be used to improve assemblies and determine
	which of two assemblies is correct in the case of sequence disagreement.
	Results from several human genome assemblies are presented.},
  
  institution = {Steck Consulting, LLC, 2121 K Street NW, Suite 700, Washington, DC
	20037, USA. ian@catmandew.com},
  keywords = {Algorithms; Computational Biology; Contig Mapping; Genome, Human;
	Humans; Sequence Analysis, DNA},
  owner = {calkan},
  pmid = {15952874},
  timestamp = {2009.01.12},
}

@article{Dohm2007,
  author    = {Juliane C Dohm and Claudio Lottaz and Tatiana Borodina and Heinz
    Himmelbauer},
  title     = {{SHARCGS}, a fast and highly accurate short-read assembly algorithm
    for de novo genomic sequencing},
  journal   = {Genome Res},
  year      = {2007},
  volume    = {17},
  pages     = {1697--1706},
  month     = {Nov},
  abstract  = {The latest revolution in the DNA sequencing field has been brought
    about by the development of automated sequencers that are capable
    of generating giga base pair data sets quickly and at low cost. Applications
    of such technologies seem to be limited to resequencing and transcript
    discovery, due to the shortness of the generated reads. In order
    to extend the fields of application to de novo sequencing, we developed
    the SHARCGS algorithm to assemble short-read (25-40-mer) data with
    high accuracy and speed. The efficiency of SHARCGS was tested on
    BAC inserts from three eukaryotic species, on two yeast chromosomes,
    and on two bacterial genomes (Haemophilus influenzae, Escherichia
    coli). We show that 30-mer-based BAC assemblies have N50 sizes >20
    kbp for Drosophila and Arabidopsis and >4 kbp for human in simulations
    taking missing reads and wrong base calls into account. We assembled
    949,974 contigs with length >50 bp, and only one single contig could
    not be aligned error-free against the reference sequences. We generated
    36-mer reads for the genome of Helicobacter acinonychis on the Illumina
    1G sequencing instrument and assembled 937 contigs covering 98\%
    of the genome with an N50 size of 3.7 kbp. With the exception of
    five contigs that differ in 1-4 positions relative to the reference
    sequence, all contigs matched the genome error-free. Thus, SHARCGS
    is a suitable tool for fully exploiting novel sequencing technologies
    by assembling sequence contigs de novo with high confidence and by
    outperforming existing assembly algorithms in terms of speed and
    accuracy.},
  keywords  = {Algorithms; Chromosomes, Artificial, Bacterial; Contig Mapping; Genomics;
    Humans; Sequence Analysis, DNA},
  owner     = {calkan},
  pii       = {gr.6435207},
  pmid      = {17908823},
  timestamp = {2008.03.02},
}

@ARTICLE{Duitama2011,
  author = {Jorge Duitama and Gayle K McEwen and Thomas Huebsch and Stefanie
	Palczewski and Sabrina Schulz and Kevin Verstrepen and Eun-Kyung
	Suk and Margret R Hoehe},
  title = {Fosmid-based whole genome haplotyping of a {HapMap} trio child: evaluation
	of Single Individual Haplotyping techniques},
  journal = {Nucleic Acids Res},
  year = {2011},
  month = {Nov},
  abstract = {Determining the underlying haplotypes of individual human genomes
	is an essential, but currently difficult, step toward a complete
	understanding of genome function. Fosmid pool-based next-generation
	sequencing allows genome-wide generation of 40-kb haploid DNA segments,
	which can be phased into contiguous molecular haplotypes computationally
	by Single Individual Haplotyping (SIH). Many SIH algorithms have
	been proposed, but the accuracy of such methods has been difficult
	to assess due to the lack of real benchmark data. To address this
	problem, we generated whole genome fosmid sequence data from a HapMap
	trio child, NA12878, for which reliable haplotypes have already been
	produced. We assembled haplotypes using eight algorithms for SIH
	and carried out direct comparisons of their accuracy, completeness
	and efficiency. Our comparisons indicate that fosmid-based haplotyping
	can deliver highly accurate results even at low coverage and that
	our SIH algorithm, ReFHap, is able to efficiently produce high-quality
	haplotypes. We expanded the haplotypes for NA12878 by combining the
	current haplotypes with our fosmid-based haplotypes, producing near-to-complete
	new gold-standard haplotypes containing almost 98\% of heterozygous
	SNPs. This improvement includes notable fractions of disease-related
	and GWA SNPs. Integrated with other molecular biological data sets,
	this phase information will advance the emerging field of diploid
	genomics.},
  
  file = {main:Duitama2011.pdf:PDF},
  institution = {Laboratory for Genetics and Genomics, Center of Microbial and Plant
	Genetics, K.U.Leuven, Gaston Geenslaan 1, B-3001 Leuven (Heverlee),
	Belgium.},
  language = {eng},
  medline-pst = {aheadofprint},
  owner = {calkan},
  pii = {gkr1042},
  pmid = {22102577},
  timestamp = {2012.03.07},
}

@ARTICLE{Eichler2001,
  author = {E. E. Eichler and M. E. Johnson and C. Alkan and E. Tuzun and C.
	Sahinalp and D. Misceo and N. Archidiacono and M. Rocchi},
  title = {Divergent origins and concerted expansion of two segmental duplications
	on chromosome 16},
  journal = {J Hered},
  year = {2001},
  volume = {92},
  pages = {462--468},
  abstract = {An unexpected finding of the human genome was the large fraction of
	the genome organized as blocks of interspersed duplicated sequence.
	We provide a comparative and phylogenetic analysis of a highly duplicated
	region of 16p12.2, which is composed of at least four different segmental
	duplications spanning in excess of 160 kb. We contrast the dispersal
	of two different segmental duplications (LCR16a and LCR16u). LCR16a,
	a 20 kb low-copy repeat sequence A from chromosome 16, was shown
	previously to contain a rapidly evolving novel hominoid gene family
	(morpheus) that had expanded within the last 10 million years of
	great ape/human evolution. We compare the dispersal of this genomic
	segment with a second adjacent duplication called LCR16u. The duplication
	contains a second putative gene family (KIAA0220/SMG1) that is represented
	approximately eight times within the human genome. A high degree
	of sequence identity (approximately 98\%) was observed among the
	various copies of LCR16u. Comparative analyses with Old World monkey
	species show that LCR16a and LCR16u originated from two distinct
	ancestral loci. Within the human genome, at least 70\% of the LCR16u
	copies were duplicated in concert with the LCR16a duplication. In
	contrast, only 30\% of the chimpanzee loci show an association between
	LCR16a and LCR16u duplications. The data suggest that the two copies
	of genomic sequence were brought together during the chimpanzee/human
	divergence and were subsequently duplicated as a larger cassette
	specifically within the human lineage. The evolutionary history of
	these two chromosome-specific duplications supports a model of rapid
	expansion and evolutionary turnover among the genomes of man and
	the great apes.},
  institution = {Department of Genetics and Center for Human Genetics, Case Western
	Reserve School of Medicine and University Hospitals of Cleveland,
	Cleveland, OH 44106, USA. eee@po.cwru.edu},
  keywords = {Animals; Chromosomes, Human, Pair 16, genetics; Evolution, Molecular;
	Gene Duplication; Genome, Human; Humans; In Situ Hybridization, Fluorescence;
	Phylogeny; Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {11948212},
  timestamp = {2011.07.19},
}

@ARTICLE{Eichler2008,
  author = {Evan E Eichler and Andrew W Zimmerman},
  title = {A hot spot of genetic instability in autism},
  journal = {N Engl J Med},
  year = {2008},
  volume = {358},
  pages = {737--739},
  month = {Feb},
  
  keywords = {Autistic Disorder; Chromosome Aberrations; Chromosome Deletion; Chromosomes,
	Human, Pair 16; Genetic Predisposition to Disease; Humans; Phenotype},
  owner = {calkan},
  pii = {NEJMe0708756},
  pmid = {18184953},
  timestamp = {2009.09.19},
}

@ARTICLE{Eid2009,
  author = {John Eid and Adrian Fehr and Jeremy Gray and Khai Luong and John
	Lyle and Geoff Otto and Paul Peluso and David Rank and Primo Baybayan
	and Brad Bettman and Arkadiusz Bibillo and Keith Bjornson and Bidhan
	Chaudhuri and Frederick Christians and Ronald Cicero and Sonya Clark
	and Ravindra Dalal and Alex Dewinter and John Dixon and Mathieu Foquet
	and Alfred Gaertner and Paul Hardenbol and Cheryl Heiner and Kevin
	Hester and David Holden and Gregory Kearns and Xiangxu Kong and Ronald
	Kuse and Yves Lacroix and Steven Lin and Paul Lundquist and Congcong
	Ma and Patrick Marks and Mark Maxham and Devon Murphy and Insil Park
	and Thang Pham and Michael Phillips and Joy Roy and Robert Sebra
	and Gene Shen and Jon Sorenson and Austin Tomaney and Kevin Travers
	and Mark Trulson and John Vieceli and Jeffrey Wegener and Dawn Wu
	and Alicia Yang and Denis Zaccarin and Peter Zhao and Frank Zhong
	and Jonas Korlach and Stephen Turner},
  title = {Real-time {DNA} sequencing from single polymerase molecules},
  journal = {Science},
  year = {2009},
  volume = {323},
  pages = {133--138},
  month = {Jan},
  abstract = {We present single-molecule, real-time sequencing data obtained from
	a DNA polymerase performing uninterrupted template-directed synthesis
	using four distinguishable fluorescently labeled deoxyribonucleoside
	triphosphates (dNTPs). We detected the temporal order of their enzymatic
	incorporation into a growing DNA strand with zero-mode waveguide
	nanostructure arrays, which provide optical observation volume confinement
	and enable parallel, simultaneous detection of thousands of single-molecule
	sequencing reactions. Conjugation of fluorophores to the terminal
	phosphate moiety of the dNTPs allows continuous observation of DNA
	synthesis over thousands of bases without steric hindrance. The data
	report directly on polymerase dynamics, revealing distinct polymerization
	states and pause sites corresponding to DNA secondary structure.
	Sequence data were aligned with the known reference sequence to assay
	biophysical parameters of polymerization for each template position.
	Consensus sequences were generated from the single-molecule reads
	at 15-fold coverage, showing a median accuracy of 99.3\%, with no
	systematic error beyond fluorophore-dependent error rates.},
  
  institution = {Pacific Biosciences, Menlo Park, CA 94025, USA.},
  keywords = {Base Sequence; Consensus Sequence; DNA; DNA, Circular; DNA, Single-Stranded;
	DNA-Directed DNA Polymerase; Deoxyribonucleotides; Enzymes, Immobilized;
	Fluorescent Dyes; Kinetics; Nanostructures; Sequence Analysis, DNA;
	Spectrometry, Fluorescence},
  owner = {calkan},
  pii = {1162986},
  pmid = {19023044},
  timestamp = {2009.04.12},
}

@ARTICLE{Farrar2007,
  author = {Michael Farrar},
  title = {Striped {Smith-Waterman} speeds database searches six times over
	other {SIMD} implementations},
  journal = {Bioinformatics},
  year = {2007},
  volume = {23},
  pages = {156--161},
  month = {Jan},
  abstract = {MOTIVATION: The only algorithm guaranteed to find the optimal local
	alignment is the Smith-Waterman. It is also one of the slowest due
	to the number of computations required for the search. To speed up
	the algorithm, Single-Instruction Multiple-Data (SIMD) instructions
	have been used to parallelize the algorithm at the instruction level.
	RESULTS: A faster implementation of the Smith-Waterman algorithm
	is presented. This algorithm achieved 2-8 times performance improvement
	over other SIMD based Smith-Waterman implementations. On a 2.0 GHz
	Xeon Core 2 Duo processor, speeds of >3.0 billion cell updates/s
	were achieved. AVAILABILITY: http://farrar.michael.googlepages.com/Smith-waterman},
  
  keywords = {Algorithms; Database Management Systems; Databases, Genetic; Information
	Storage and Retrieval; Sequence Alignment; Sequence Analysis; Time
	Factors},
  owner = {calkan},
  pii = {btl582},
  pmid = {17110365},
  timestamp = {2008.03.02},
}

@ARTICLE{Fellermann2006,
  author = {Klaus Fellermann and Daniel E Stange and Elke Schaeffeler and Hartmut
	Schmalzl and Jan Wehkamp and Charles L Bevins and Walter Reinisch
	and Alexander Teml and Matthias Schwab and Peter Lichter and Bernhard
	Radlwimmer and Eduard F Stange},
  title = {A chromosome 8 gene-cluster polymorphism with low human beta-defensin
	2 gene copy number predisposes to {Crohn} disease of the colon},
  journal = {Am J Hum Genet},
  year = {2006},
  volume = {79},
  pages = {439--448},
  month = {Sep},
  abstract = {Defensins are endogenous antimicrobial peptides that protect the intestinal
	mucosa against bacterial invasion. It has been suggested that deficient
	defensin expression may underlie the chronic inflammation of Crohn
	disease (CD). The DNA copy number of the beta-defensin gene cluster
	on chromosome 8p23.1 is highly polymorphic within the healthy population,
	which suggests that the defective beta-defensin induction in colonic
	CD could be due to low beta-defensin-gene copy number. Here, we tested
	this hypothesis, using genomewide DNA copy number profiling by array-based
	comparative genomic hybridization and quantitative polymerase-chain-reaction
	analysis of the human beta-defensin 2 (HBD-2) gene. We showed that
	healthy individuals, as well as patients with ulcerative colitis,
	have a median of 4 (range 2-10) HBD-2 gene copies per genome. In
	a surgical cohort with ileal or colonic CD and in a second large
	cohort with inflammatory bowel diseases, those with ileal resections/disease
	exhibited a normal median HBD-2 copy number of 4, whereas those with
	colonic CD had a median of only 3 copies per genome (P=.008 for the
	surgical cohort; P=.032 for the second cohort). Overall, the copy
	number distribution in colonic CD was shifted to lower numbers compared
	with controls (P=.002 for both the surgical cohort and the cohort
	with inflammatory bowel diseases). Individuals with < or = 3 copies
	have a significantly higher risk of developing colonic CD than did
	individuals with > or = 4 copies (odds ratio 3.06; 95\% confidence
	interval 1.46-6.45). An HBD-2 gene copy number of < 4 was associated
	with diminished mucosal HBD-2 mRNA expression (P=.033). In conclusion,
	a lower HBD-2 gene copy number in the beta-defensin locus predisposes
	to colonic CD, most likely through diminished beta-defensin expression.},
  
  institution = {Department of Internal Medicine I, Robert-Bosch-Hospital, 70376 Stuttgart,
	Germany. klaus.fellermann@rbk.de},
  keywords = {Chromosomes, Human, Pair 8; Colon; Crohn Disease; DNA Mutational Analysis;
	Gene Dosage; Gene Expression; Genetic Predisposition to Disease;
	Humans; Inflammatory Bowel Diseases; Multigene Family; Oligonucleotide
	Array Sequence Analysis; Polymorphism, Genetic; beta-Defensins},
  owner = {calkan},
  pii = {S0002-9297(07)62743-8},
  pmid = {16909382},
  timestamp = {2009.09.19},
}

@ARTICLE{Ferragina07compressedrepresentations,
  author = {Paolo Ferragina and Giovanni Manzini and Veli M{\"a}kinen and Gonzalo
	Navarro},
  title = {Compressed representations of sequences and full-text indexes},
  journal = {ACM Transactions on Algorithms},
  year = {2007},
  volume = {3},
}

@ARTICLE{Flicek2009,
  author = {Paul Flicek},
  title = {The need for speed},
  journal = {Genome Biol},
  year = {2009},
  volume = {10},
  pages = {212},
  month = {Mar},
  abstract = {ABSTRACT: DNA sequence data are being produced at an ever-increasing
	rate. The Bowtie sequence-alignment algorithm uses advanced data
	structures to help data analysis keep pace with data generation.},
  
  institution = {European Bioinformatics Institute, Wellcome Trust Genome Campus,
	Hinxton, Cambridge CB10 1SD, UK. flicek@ebi.ac.uk.},
  owner = {calkan},
  pii = {gb-2009-10-3-212},
  pmid = {19344490},
  timestamp = {2009.04.17},
}

@article{Freeman2006,
  author    = {Jennifer L Freeman and George H Perry and Lars Feuk and Richard Redon
    and Steven A McCarroll and David M Altshuler and Hiroyuki Aburatani
    and Keith W Jones and Chris Tyler-Smith and Matthew E Hurles and
    Nigel P Carter and Stephen W Scherer and Charles Lee},
  title     = {Copy number variation: new insights in genome diversity},
  journal   = {Genome Res},
  year      = {2006},
  volume    = {16},
  pages     = {949--961},
  month     = {Aug},
  abstract  = {DNA copy number variation has long been associated with specific chromosomal
    rearrangements and genomic disorders, but its ubiquity in mammalian
    genomes was not fully realized until recently. Although our understanding
    of the extent of this variation is still developing, it seems likely
    that, at least in humans, copy number variants (CNVs) account for
    a substantial amount of genetic variation. Since many CNVs include
    genes that result in differential levels of gene expression, CNVs
    may account for a significant proportion of normal phenotypic variation.
    Current efforts are directed toward a more comprehensive cataloging
    and characterization of CNVs that will provide the basis for determining
    how genomic diversity impacts biological function, evolution, and
    common human diseases.},
  keywords  = {Animals; Chromosome Mapping; Evolution; Gene Dosage; Gene Expression;
    Genome, Human; Humans; Phenotype},
  owner     = {calkan},
  pii       = {gr.3677206},
  pmid      = {16809666},
  timestamp = {2007.05.11},
}

@ARTICLE{Friedrichs2009,
  author = {Mark S Friedrichs and Peter Eastman and Vishal Vaidyanathan and Mike
	Houston and Scott Legrand and Adam L Beberg and Daniel L Ensign and
	Christopher M Bruns and Vijay S Pande},
  title = {Accelerating molecular dynamic simulation on graphics processing
	units},
  journal = {J Comput Chem},
  year = {2009},
  volume = {30},
  pages = {864--872},
  month = {Apr},
  abstract = {We describe a complete implementation of all-atom protein molecular
	dynamics running entirely on a graphics processing unit (GPU), including
	all standard force field terms, integration, constraints, and implicit
	solvent. We discuss the design of our algorithms and important optimizations
	needed to fully take advantage of a GPU. We evaluate its performance,
	and show that it can be more than 700 times faster than a conventional
	implementation running on a single CPU core.},
  
  institution = {Department of Bioengineering, Stanford University, Stanford, California
	94305, USA.},
  owner = {calkan},
  pmid = {19191337},
  timestamp = {2009.04.25},
}

@ARTICLE{G10KCOS2009,
  author = {{Genome 10K Community of Scientists}},
  title = {{Genome 10K}: a proposal to obtain whole-genome sequence for 10,000
	vertebrate species},
  journal = {J Hered},
  year = {2009},
  volume = {100},
  pages = {659--674},
  abstract = {The human genome project has been recently complemented by whole-genome
	assessment sequence of 32 mammals and 24 nonmammalian vertebrate
	species suitable for comparative genomic analyses. Here we anticipate
	a precipitous drop in costs and increase in sequencing efficiency,
	with concomitant development of improved annotation technology and,
	therefore, propose to create a collection of tissue and DNA specimens
	for 10,000 vertebrate species specifically designated for whole-genome
	sequencing in the very near future. For this purpose, we, the Genome
	10K Community of Scientists (G10KCOS), will assemble and allocate
	a biospecimen collection of some 16,203 representative vertebrate
	species spanning evolutionary diversity across living mammals, birds,
	nonavian reptiles, amphibians, and fishes (ca. 60,000 living species).
	In this proposal, we present precise counts for these 16,203 individual
	species with specimens presently tagged and stipulated for DNA sequencing
	by the G10KCOS. DNA sequencing has ushered in a new era of investigation
	in the biological sciences, allowing us to embark for the first time
	on a truly comprehensive study of vertebrate evolution, the results
	of which will touch nearly every aspect of vertebrate biological
	enquiry.},
  
  file = {genome10k:G10KCOS2009.pdf:PDF;genome10ksupp:G10KCOS2009-supp.pdf:PDF},
  keywords = {Animals; Biological Specimen Banks; Genome, genetics; Genomics, economics/methods;
	Species Specificity; Vertebrates, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {esp086},
  pmid = {19892720},
  timestamp = {2011.06.17},
}

@ARTICLE{Gentleman2004a,
  author = {Robert C Gentleman and Vincent J Carey and Douglas M Bates and Ben
	Bolstad and Marcel Dettling and Sandrine Dudoit and Byron Ellis and
	Laurent Gautier and Yongchao Ge and Jeff Gentry and Kurt Hornik and
	Torsten Hothorn and Wolfgang Huber and Stefano Iacus and Rafael Irizarry
	and Friedrich Leisch and Cheng Li and Martin Maechler and Anthony
	J Rossini and Gunther Sawitzki and Colin Smith and Gordon Smyth and
	Luke Tierney and Jean Y H Yang and Jianhua Zhang},
  title = {Bioconductor: open software development for computational biology
	and bioinformatics},
  journal = {Genome Biol},
  year = {2004},
  volume = {5},
  pages = {R80},
  abstract = {The Bioconductor project is an initiative for the collaborative creation
	of extensible software for computational biology and bioinformatics.
	The goals of the project include: fostering collaborative development
	and widespread use of innovative software, reducing barriers to entry
	into interdisciplinary scientific research, and promoting the achievement
	of remote reproducibility of research results. We describe details
	of our aims and methods, identify current challenges, compare Bioconductor
	to other open bioinformatics projects, and provide working examples.},
  
  institution = {Department of Biostatistical Science, Dana-Farber Cancer Institute,
	44 Binney St, Boston, MA 02115, USA. rgentlem@jimmy.harvard.edu},
  keywords = {Computational Biology; Internet; Reproducibility of Results; Software},
  owner = {calkan},
  pii = {gb-2004-5-10-r80},
  pmid = {15461798},
  timestamp = {2009.10.25},
}

@ARTICLE{Girirajan2010,
  author = {Santhosh Girirajan and Jill A Rosenfeld and Gregory M Cooper and
	Francesca Antonacci and Priscillia Siswara and Andy Itsara and Laura
	Vives and Tom Walsh and Shane E McCarthy and Carl Baker and Heather
	C Mefford and Jeffrey M Kidd and Sharon R Browning and Brian L Browning
	and Diane E Dickel and Deborah L Levy and Blake C Ballif and Kathryn
	Platky and Darren M Farber and Gordon C Gowans and Jessica J Wetherbee
	and Alexander Asamoah and David D Weaver and Paul R Mark and Jennifer
	Dickerson and Bhuwan P Garg and Sara A Ellingwood and Rosemarie Smith
	and Valerie C Banks and Wendy Smith and Marie T McDonald and Joe
	J Hoo and Beatrice N French and Cindy Hudson and John P Johnson and
	Jillian R Ozmore and John B Moeschler and Urvashi Surti and Luis
	F Escobar and Dima El-Khechen and Jerome L Gorski and Jennifer Kussmann
	and Bonnie Salbert and Yves Lacassie and Alisha Biser and Donna M
	McDonald-McGinn and Elaine H Zackai and Matthew A Deardorff and Tamim
	H Shaikh and Eric Haan and Kathryn L Friend and Marco Fichera and
	Corrado Romano and Jozef G{\'e}cz and Lynn E DeLisi and Jonathan Sebat
	and Mary-Claire King and Lisa G Shaffer and Evan E Eichler},
  title = {A recurrent 16p12.1 microdeletion supports a two-hit model for severe
	developmental delay},
  journal = {Nat Genet},
  year = {2010},
  volume = {42},
  pages = {203--209},
  month = mar,
  abstract = {We report the identification of a recurrent, 520-kb 16p12.1 microdeletion
	associated with childhood developmental delay. The microdeletion
	was detected in 20 of 11,873 cases compared with 2 of 8,540 controls
	(P = 0.0009, OR = 7.2) and replicated in a second series of 22 of
	9,254 cases compared with 6 of 6,299 controls (P = 0.028, OR = 2.5).
	Most deletions were inherited, with carrier parents likely to manifest
	neuropsychiatric phenotypes compared to non-carrier parents (P =
	0.037, OR = 6). Probands were more likely to carry an additional
	large copy-number variant when compared to matched controls (10 of
	42 cases, P = 5.7 x 10(-5), OR = 6.6). The clinical features of individuals
	with two mutations were distinct from and/or more severe than those
	of individuals carrying only the co-occurring mutation. Our data
	support a two-hit model in which the 16p12.1 microdeletion both predisposes
	to neuropsychiatric phenotypes as a single event and exacerbates
	neurodevelopmental phenotypes in association with other large deletions
	or duplications. Analysis of other microdeletions with variable expressivity
	indicates that this two-hit model might be more generally applicable
	to neuropsychiatric disease.},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, Washington, USA.},
  keywords = {Adult; Case-Control Studies; Child; Child, Preschool; Chromosome Deletion;
	Chromosomes, Human, Pair 16, genetics; Comparative Genomic Hybridization,
	methods; Developmental Disabilities, genetics; Family; Gene Frequency;
	Humans; Infant; Models, Genetic; Oligonucleotide Array Sequence Analysis;
	Pedigree; Phenotype; Polymorphism, Single Nucleotide; Recurrence;
	Severity of Illness Index},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {ng.534},
  pmid = {20154674},
  timestamp = {2010.09.15},
}

@ARTICLE{Gnerre2011,
  author = {Sante Gnerre and Iain MacCallum and Dariusz Przybylski and Filipe
	J Ribeiro and Joshua N Burton and Bruce J Walker and Ted Sharpe and
	Giles Hall and Terrance P Shea and Sean Sykes and Aaron M Berlin
	and Daniel Aird and Maura Costello and Riza Daza and Louise Williams
	and Robert Nicol and Andreas Gnirke and Chad Nusbaum and Eric S Lander
	and David B Jaffe},
  title = {High-quality draft assemblies of mammalian genomes from massively
	parallel sequence data},
  journal = {Proc Natl Acad Sci U S A},
  year = {2011},
  volume = {108},
  pages = {1513--1518},
  month = jan,
  abstract = {Massively parallel DNA sequencing technologies are revolutionizing
	genomics by making it possible to generate billions of relatively
	short (~100-base) sequence reads at very low cost. Whereas such data
	can be readily used for a wide range of biomedical applications,
	it has proven difficult to use them to generate high-quality de novo
	genome assemblies of large, repeat-rich vertebrate genomes. To date,
	the genome assemblies generated from such data have fallen far short
	of those obtained with the older (but much more expensive) capillary-based
	sequencing approach. Here, we report the development of an algorithm
	for genome assembly, ALLPATHS-LG, and its application to massively
	parallel DNA sequence data from the human and mouse genomes, generated
	on the Illumina platform. The resulting draft genome assemblies have
	good accuracy, short-range contiguity, long-range connectivity, and
	coverage of the genome. In particular, the base accuracy is high
	(≥99.95\%) and the scaffold sizes (N50 size = 11.5 Mb for human and
	7.2 Mb for mouse) approach those obtained with capillary-based sequencing.
	The combination of improved sequencing technology and improved computational
	methods should now make it possible to increase dramatically the
	de novo sequencing of large genomes. The ALLPATHS-LG program is available
	at http://www.broadinstitute.org/science/programs/genome-biology/crd.},
  file = {main:Gnerre2011.pdf:PDF},
  institution = {Broad Institute of MIT and Harvard, Cambridge, MA 02142, USA.},
  keywords = {Algorithms; Animals; Genome, genetics; Genomics, methods; Humans;
	Internet; Mice; Reproducibility of Results; Sequence Analysis, DNA,
	methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {1017351108},
  pmid = {21187386},
  timestamp = {2012.03.03},
}

@ARTICLE{Gonzalez2005,
  author = {Enrique Gonzalez and Hemant Kulkarni and Hector Bolivar and Andrea
	Mangano and Racquel Sanchez and Gabriel Catano and Robert J Nibbs
	and Barry I Freedman and Marlon P Quinones and Michael J Bamshad
	and Krishna K Murthy and Brad H Rovin and William Bradley and Robert
	A Clark and Stephanie A Anderson and Robert J O'Connell and Brian
	K Agan and Seema S Ahuja and Rosa Bologna and Luisa Sen and Matthew
	J Dolan and Sunil K Ahuja},
  title = {The influence of {CCL3L1} gene-containing segmental duplications
	on {HIV-1/AIDS} susceptibility},
  journal = {Science},
  year = {2005},
  volume = {307},
  pages = {1434--1440},
  month = mar,
  abstract = {Segmental duplications in the human genome are selectively enriched
	for genes involved in immunity, although the phenotypic consequences
	for host defense are unknown. We show that there are significant
	interindividual and interpopulation differences in the copy number
	of a segmental duplication encompassing the gene encoding CCL3L1
	(MIP-1alphaP), a potent human immunodeficiency virus-1 (HIV-1)-suppressive
	chemokine and ligand for the HIV coreceptor CCR5. Possession of a
	CCL3L1 copy number lower than the population average is associated
	with markedly enhanced HIV/acquired immunodeficiency syndrome (AIDS)
	susceptibility. This susceptibility is even greater in individuals
	who also possess disease-accelerating CCR5 genotypes. This relationship
	between CCL3L1 dose and altered HIV/AIDS susceptibility points to
	a central role for CCL3L1 in HIV/AIDS pathogenesis and indicates
	that differences in the dose of immune response genes may constitute
	a genetic basis for variable responses to infectious diseases.},
  institution = {Veterans Administration Research Center for AIDS and HIV-1 Infection,
	South Texas Veterans Health Care System, and Department of Medicine,
	University of Texas Health Science Center, San Antonio, TX 78229,
	USA.},
  keywords = {Adolescent; Adult; Aged; Animals; Chemokines, CC; Child; Cohort Studies;
	Continental Population Groups; Disease Progression; Ethnic Groups;
	Female; Gene Dosage; Gene Duplication; Genetic Predisposition to
	Disease; Genotype; HIV Infections; HIV-1; Humans; Male; Middle Aged;
	Pan troglodytes; Phenotype; Public Health; Receptors, CCR5; Selection
	(Genetics)},
  owner = {calkan},
  pii = {1101160},
  pmid = {15637236},
  timestamp = {2009.09.19},
}

@ARTICLE{Green2010,
  author = {Richard E Green and Johannes Krause and Adrian W Briggs and Tomislav
    Maricic and Udo Stenzel and Martin Kircher and Nick Patterson and
    Heng Li and Weiwei Zhai and Markus Hsi-Yang Fritz and Nancy F Hansen
    and Eric Y Durand and Anna-Sapfo Malaspinas and Jeffrey D Jensen
    and Tomas Marques-Bonet and Can Alkan and Kay Pr{\"u}fer and Matthias
    Meyer and Hern{\'a}n A Burbano and Jeffrey M Good and Rigo Schultz and
    Ayinuer Aximu-Petri and Anne Butthof and Barbara H{\"o}ber and Barbara
    H{\"o}ffner and Madlen Siegemund and Antje Weihmann and Chad Nusbaum
    and Eric S Lander and Carsten Russ and others},
  title = {A draft sequence of the {Neandertal} genome},
  journal = {Science},
  year = {2010},
  volume = {328},
  pages = {710--722},
  month = may,
  abstract = {Neandertals, the closest evolutionary relatives of present-day humans,
	lived in large parts of Europe and western Asia before disappearing
	30,000 years ago. We present a draft sequence of the Neandertal genome
	composed of more than 4 billion nucleotides from three individuals.
	Comparisons of the Neandertal genome to the genomes of five present-day
	humans from different parts of the world identify a number of genomic
	regions that may have been affected by positive selection in ancestral
	modern humans, including genes involved in metabolism and in cognitive
	and skeletal development. We show that Neandertals shared more genetic
	variants with present-day humans in Eurasia than with present-day
	humans in sub-Saharan Africa, suggesting that gene flow from Neandertals
	into the ancestors of non-Africans occurred before the divergence
	of Eurasian groups from each other.},
  file = {main:Green2010.pdf:PDF;supp:Green2010-supp.pdf:PDF},
  institution = {Department of Evolutionary Genetics, Max-Planck Institute for Evolutionary
	Anthropology, D-04103 Leipzig, Germany. green@eva.mpg.de},
  keywords = {African Continental Ancestry Group, genetics; Animals; Base Sequence;
	Bone and Bones; DNA, Mitochondrial, genetics; European Continental
	Ancestry Group, genetics; Evolution, Molecular; Extinction, Biological;
	Female; Fossils; Gene Dosage; Gene Flow; Genetic Variation; Genome;
	Genome, Human; Haplotypes; Hominidae, genetics; Humans; Pan troglodytes,
	genetics; Polymorphism, Single Nucleotide; Selection, Genetic; Sequence
	Alignment; Sequence Analysis, DNA; Time; up, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {328/5979/710},
  pmid = {20448178},
  timestamp = {2010.09.15},
}

@ARTICLE{bwa,
  author = {Li, Heng and Durbin, Richard},
  title = {Fast and accurate short read alignment with {Burrows-Wheeler} transform},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  number = {14},
  pages = {1754--1760},
  month = jul,
  doi = {10.1093/bioinformatics/btp324},
  pmid = {19451168},
}

@ARTICLE{Hach2010,
  author = {Faraz Hach and Fereydoun Hormozdiari and Can Alkan and Farhad Hormozdiari
    and Inanc Birol and Evan E Eichler and S. Cenk Sahinalp},
  title = {{mrsFAST}: a cache-oblivious algorithm for short-read mapping},
  journal = {Nat Methods},
  year = {2010},
  volume = {7},
  pages = {576--577},
  month = aug,
  file = {main:Hach2010.pdf:PDF;supp:Hach2010-supp.pdf:PDF},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nmeth0810-576},
  pmid = {20676076},
  timestamp = {2010.09.15},
}

@ARTICLE{Hajirasouliha2010,
  author = {Iman Hajirasouliha and Fereydoun Hormozdiari and Can Alkan and Jeffrey
	M Kidd and Inanc Birol and Evan E Eichler and S. Cenk Sahinalp},
  title = {Detection and characterization of novel sequence insertions using
	paired-end next-generation sequencing},
  journal = {Bioinformatics},
  year = {2010},
  volume = {26},
  pages = {1277--1283},
  month = may,
  abstract = {MOTIVATION: In the past few years, human genome structural variation
	discovery has enjoyed increased attention from the genomics research
	community. Many studies were published to characterize short insertions,
	deletions, duplications and inversions, and associate copy number
	variants (CNVs) with disease. Detection of new sequence insertions
	requires sequence data, however, the 'detectable' sequence length
	with read-pair analysis is limited by the insert size. Thus, longer
	sequence insertions that contribute to our genetic makeup are not
	extensively researched. RESULTS: We present NovelSeq: a computational
	framework to discover the content and location of long novel sequence
	insertions using paired-end sequencing data generated by the next-generation
	sequencing platforms. Our framework can be built as part of a general
	sequence analysis pipeline to discover multiple types of genetic
	variation (SNPs, structural variation, etc.), thus it requires significantly
	less-computational resources than de novo sequence assembly. We apply
	our methods to detect novel sequence insertions in the genome of
	an anonymous donor and validate our results by comparing with the
	insertions discovered in the same genome using various sources of
	sequence data. AVAILABILITY: The implementation of the NovelSeq pipeline
	is available at http://compbio.cs.sfu.ca/strvar.htm CONTACT: eee@gs.washington.edu;
	cenk@cs.sfu.ca},
  file = {main:Hajirasouliha2010.pdf:PDF},
  institution = {Lab for Computational Biology, Simon Fraser University, Burnaby,
	BC, Canada.},
  keywords = {Databases, Genetic; Genetic Variation; Genome, Human; Genomics, methods;
	Humans; Models, Genetic; Mutagenesis, Insertional; Polymorphism,
	Single Nucleotide; Sequence Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btq152},
  pmid = {20385726},
  timestamp = {2010.09.15},
}

@ARTICLE{Hajirasouliha2008,
  author = {Iman Hajirasouliha and Fereydoun Hormozdiari and S. Cenk Sahinalp
	and Inanc Birol},
  title = {Optimal pooling for genome re-sequencing with ultra-high-throughput
	short-read technologies},
  journal = {Bioinformatics},
  year = {2008},
  volume = {24},
  pages = {i32--i40},
  month = jul,
  abstract = {New generation sequencing technologies offer unique opportunities
	and challenges for re-sequencing studies. In this article, we focus
	on re-sequencing experiments using the Solexa technology, based on
	bacterial artificial chromosome (BAC) clones, and address an experimental
	design problem. In these specific experiments, approximate coordinates
	of the BACs on a reference genome are known, and fine-scale differences
	between the BAC sequences and the reference are of interest. The
	high-throughput characteristics of the sequencing technology makes
	it possible to multiplex BAC sequencing experiments by pooling BACs
	for a cost-effective operation. However, the way BACs are pooled
	in such re-sequencing experiments has an effect on the downstream
	analysis of the generated data, mostly due to subsequences common
	to multiple BACs. The experimental design strategy we develop in
	this article offers combinatorial solutions based on approximation
	algorithms for the well-known max n-cut problem and the related max
	n-section problem on hypergraphs. Our algorithms, when applied to
	a number of sample cases give more than a 2-fold performance improvement
	over random partitioning.},
  institution = {Lab for Computational Biology, Simon Fraser University, Burnaby,
	BC, Canada.},
  keywords = {Algorithms; Base Sequence; Chromosome Mapping, methods; Chromosomes,
	Artificial, Bacterial, genetics; Molecular Sequence Data; Sequence
	Alignment, methods; Sequence Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btn173},
  pmid = {18586730},
  timestamp = {2010.09.15},
}

@ARTICLE{Halldorsson2011,
  author = {Bjarni V Halld{\'o}rsson and Derek Aguiar and Sorin Istrail},
  title = {Haplotype phasing by multi-assembly of shared haplotypes: phase-dependent
	interactions between rare variants},
  journal = {Pac Symp Biocomput},
  year = {2011},
  pages = {88--99},
  abstract = {In this paper we propose algorithmic strategies, Lander-Waterman-like
	statistical estimates, and genome-wide software for haplotype phasing
	by multi-assembly of shared haplotypes. Specifically, we consider
	four types of results which together provide a comprehensive workflow
	of GWAS data sets: (1) statistics of multi-assembly of shared haplotypes
	(2) graph theoretic algorithms for haplotype assembly based on conflict
	graphs of sequencing reads (3) inference of pedigree structure through
	haplotype sharing via tract finding algorithms and (4) multi-assembly
	of shared haplotypes of cases, controls, and trios. The input for
	the workflows that we consider are any of the combination of: (A)
	genotype data (B) next generation sequencing (NGS) (C) pedigree information.
	(1) We present Lander-Waterman-like statistics for NGS projects for
	the multi-assembly of shared haplotypes. Results are presented in
	Sec. 2. (2) In Sec. 3, we present algorithmic strategies for haplotype
	assembly using NGS, NGS + genotype data, and NGS + pedigree information.
	(3) This work builds on algorithms presented in Halldórsson et al.(1)
	and are part of the same library of tools co-developed for GWAS workflows.
	(4) Section 3.3.1 contains algorithmic strategies for multi-assembly
	of GWAS data. We present algorithms for assembling large data sets
	and for determining and using shared haplotypes to more reliably
	assemble and phase the data. Workflows 1-4 provide a set of rigorous
	algorithms which have the potential to identify phase-dependent interactions
	between rare variants in linkage equilibrium which are associated
	with cases. They build on our extensive work on haplotype phasing,(1-3)
	haplotype assembly,(4,5) and whole genome assembly comparison.(6).},
  file = {main:Halldorsson2011.pdf:PDF},
  institution = {School of Science and Engineering, Reykjavik University, Reykjavik,
	Iceland. bjarnivh@ru.is.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {9789814335058_0010},
  pmid = {21121036},
  timestamp = {2012.03.07},
}

@ARTICLE{Harris2008,
  author = {Timothy D Harris and Phillip R Buzby and Hazen Babcock and Eric Beer
	and Jayson Bowers and Ido Braslavsky and Marie Causey and Jennifer
	Colonell and James Dimeo and J. William Efcavitch and Eldar Giladi
	and Jaime Gill and John Healy and Mirna Jarosz and Dan Lapen and
	Keith Moulton and Stephen R Quake and Kathleen Steinmann and Edward
	Thayer and Anastasia Tyurina and Rebecca Ward and Howard Weiss and
	Zheng Xie},
  title = {Single-molecule {DNA} sequencing of a viral genome},
  journal = {Science},
  year = {2008},
  volume = {320},
  pages = {106--109},
  month = apr,
  abstract = {The full promise of human genomics will be realized only when the
	genomes of thousands of individuals can be sequenced for comparative
	analysis. A reference sequence enables the use of short read length.
	We report an amplification-free method for determining the nucleotide
	sequence of more than 280,000 individual DNA molecules simultaneously.
	A DNA polymerase adds labeled nucleotides to surface-immobilized
	primer-template duplexes in stepwise fashion, and the asynchronous
	growth of individual DNA molecules was monitored by fluorescence
	imaging. Read lengths of >25 bases and equivalent phred software
	program quality scores approaching 30 were achieved. We used this
	method to sequence the M13 virus to an average depth of >150x and
	with 100\% coverage; thus, we resequenced the M13 genome with high-sensitivity
	mutation detection. This demonstrates a strategy for high-throughput
	low-cost resequencing.},
  institution = {Helicos BioSciences Corporation, One Kendall Square, Cambridge, MA
	02139, USA. tharris@helicosbio.com},
  keywords = {Algorithms; Bacteriophage M13; Computational Biology; DNA Primers;
	DNA, Viral; Genome, Viral; Mutation; Sequence Alignment; Sequence
	Analysis, DNA; Software; Templates, Genetic},
  owner = {calkan},
  pii = {320/5872/106},
  pmid = {18388294},
  timestamp = {2009.04.17},
}

@ARTICLE{Helbig2009,
  author = {Ingo Helbig and Heather C Mefford and Andrew J Sharp and Michel Guipponi
	and Marco Fichera and Andre Franke and Hiltrud Muhle and Carolien
	de Kovel and Carl Baker and Sarah von Spiczak and Katherine L Kron
	and Ines Steinich and Ailing A Kleefuss-Lie and Costin Leu and Verena
	Gaus and Bettina Schmitz and Karl M Klein and Philipp S Reif and
	Felix Rosenow and Yvonne Weber and Holger Lerche and Fritz Zimprich
	and Lydia Urak and Karoline Fuchs and Martha Feucht and Pierre Genton
	and Pierre Thomas and Frank Visscher and Gerrit-Jan de Haan and Rikke
	S M{\o}ller and Helle Hjalgrim and Daniela Luciano and Michael Wittig
	and Michael Nothnagel and Christian E Elger and Peter N{\"u}rnberg and
	Corrado Romano and Alain Malafosse and Bobby P C Koeleman and Dick
	Lindhout and Ulrich Stephani and Stefan Schreiber and Evan E Eichler
	and Thomas Sander},
  title = {15q13.3 microdeletions increase risk of idiopathic generalized epilepsy},
  journal = {Nat Genet},
  year = {2009},
  volume = {41},
  pages = {160--162},
  month = feb,
  abstract = {We identified 15q13.3 microdeletions encompassing the CHRNA7 gene
	in 12 of 1,223 individuals with idiopathic generalized epilepsy (IGE),
	which were not detected in 3,699 controls (joint P = 5.32 x 10(-8)).
	Most deletion carriers showed common IGE syndromes without other
	features previously associated with 15q13.3 microdeletions, such
	as intellectual disability, autism or schizophrenia. Our results
	indicate that 15q13.3 microdeletions constitute the most prevalent
	risk factor for common epilepsies identified to date.},
  institution = {Department of Neuropediatrics, University Medical Center Schleswig-Holstein
	(Kiel Campus), Schwanenweg 20, 24105 Kiel, Germany.},
  keywords = {Adolescent; Adult; Case-Control Studies; Child; Child, Preschool;
	Chromosome Deletion; Chromosomes, Human, Pair 15; Comparative Genomic
	Hybridization; Epilepsy, Generalized; Female; Genetic Predisposition
	to Disease; Humans; Male; Receptors, Nicotinic; Risk Factors; Young
	Adult},
  owner = {calkan},
  pii = {ng.292},
  pmid = {19136953},
  timestamp = {2009.09.19},
}

@ARTICLE{Hillier2008,
  author = {LaDeana W Hillier and Gabor T Marth and Aaron R Quinlan and David
	Dooling and Ginger Fewell and Derek Barnett and Paul Fox and Jarret
	I Glasscock and Matthew Hickenbotham and Weichun Huang and Vincent
	J Magrini and Ryan J Richt and Sacha N Sander and Donald A Stewart
	and Michael Stromberg and Eric F Tsung and Todd Wylie and Tim Schedl
	and Richard K Wilson and Elaine R Mardis},
  title = {Whole-genome sequencing and variant discovery in {C. elegans}},
  journal = {Nat Methods},
  year = {2008},
  volume = {5},
  pages = {183--188},
  month = feb,
  abstract = {Massively parallel sequencing instruments enable rapid and inexpensive
	DNA sequence data production. Because these instruments are new,
	their data require characterization with respect to accuracy and
	utility. To address this, we sequenced a Caernohabditis elegans N2
	Bristol strain isolate using the Solexa Sequence Analyzer, and compared
	the reads to the reference genome to characterize the data and to
	evaluate coverage and representation. Massively parallel sequencing
	facilitates strain-to-reference comparison for genome-wide sequence
	variant discovery. Owing to the short-read-length sequences produced,
	we developed a revised approach to determine the regions of the genome
	to which short reads could be uniquely mapped. We then aligned Solexa
	reads from C. elegans strain CB4858 to the reference, and screened
	for single-nucleotide polymorphisms (SNPs) and small indels. This
	study demonstrates the utility of massively parallel short read sequencing
	for whole genome resequencing and for accurate discovery of genome-wide
	polymorphisms.},
  keywords = {Animals; Base Sequence; Caenorhabditis elegans; Chromosome Mapping;
	DNA Mutational Analysis; Molecular Sequence Data; Polymorphism, Single
	Nucleotide; Sequence Analysis, DNA; Variation (Genetics)},
  owner = {calkan},
  pii = {nmeth.1179},
  pmid = {18204455},
  timestamp = {2008.03.11},
}

@ARTICLE{Hoberman2009,
  author = {Rose Hoberman and Joana Dias and Bing Ge and Eef Harmsen and Michael
	Mayhew and Dominique J Verlaan and Tony Kwan and Ken Dewar and Mathieu
	Blanchette and Tomi Pastinen},
  title = {A probabilistic approach for {SNP} discovery in high-throughput human
	resequencing data},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1542--1552},
  month = sep,
  abstract = {New high-throughput sequencing technologies are generating large amounts
	of sequence data, allowing the development of targeted large-scale
	resequencing studies. For these studies, accurate identification
	of polymorphic sites is crucial. Heterozygous sites are particularly
	difficult to identify, especially in regions of low coverage. We
	present a new strategy for identifying heterozygous sites in a single
	individual by using a machine learning approach that generates a
	heterozygosity score for each chromosomal position. Our approach
	also facilitates the identification of regions with unequal representation
	of two alleles and other poorly sequenced regions. The availability
	of confidence scores allows for a principled combination of sequencing
	results from multiple samples. We evaluate our method on a gold standard
	data genotype set from HapMap. We are able to classify sites in this
	data set as heterozygous or homozygous with 98.5\% accuracy. In de
	novo data our probabilistic heterozygote detection ("ProbHD") is
	able to identify 93\% of heterozygous sites at a <5\% false call
	rate (FCR) as estimated based on independent genotyping results.
	In direct comparison of ProbHD with high-coverage 1000 Genomes sequencing
	available for a subset of our data, we observe >99.9\% overall agreement
	for genotype calls and close to 90\% agreement for heterozygote calls.
	Overall, our data indicate that high-throughput resequencing of human
	genomic regions requires careful attention to systematic biases in
	sample preparation as well as sequence contexts, and that their impact
	can be alleviated by machine learning-based sequence analyses allowing
	more accurate extraction of true DNA variants.},
  institution = {McGill Centre for Bioinformatics, McGill University, Montréal H36
	0B1, Canada;},
  owner = {calkan},
  pii = {gr.092072.109},
  pmid = {19605794},
  timestamp = {2009.09.22},
}

@ARTICLE{Hollox2008,
  author = {Edward J Hollox and Ulrike Huffmeier and Patrick L J M Zeeuwen and
	Raquel Palla and Jes{\'u}s Lascorz and Diana Rodijk-Olthuis and Peter
	C M van de Kerkhof and Heiko Traupe and Gys de Jongh and Martin den
	Heijer and Andr{\'e} Reis and John A L Armour and Joost Schalkwijk},
  title = {Psoriasis is associated with increased beta-defensin genomic copy
	number},
  journal = {Nat Genet},
  year = {2008},
  volume = {40},
  pages = {23--25},
  month = jan,
  abstract = {Psoriasis is a common inflammatory skin disease with a strong genetic
	component. We analyzed the genomic copy number polymorphism of the
	beta-defensin region on human chromosome 8 in 179 Dutch individuals
	with psoriasis and 272 controls and in 319 German individuals with
	psoriasis and 305 controls. Comparisons in both cohorts showed a
	significant association between higher genomic copy number for beta-defensin
	genes and risk of psoriasis.},
  institution = {Department of Genetics, University of Leicester, Leicester LE1 7RH,
	UK.},
  keywords = {Case-Control Studies; Chromosomes, Human, Pair 8; Gene Dosage; Genetic
	Predisposition to Disease; Humans; Polymorphism, Genetic; Polymorphism,
	Single Nucleotide; Psoriasis, genetics; beta-Defensins, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {ng.2007.48},
  pmid = {18059266},
  timestamp = {2011.07.22},
}

@ARTICLE{Hormozdiari2009,
  author = {Fereydoun Hormozdiari and Can Alkan and Evan E Eichler and S. Cenk
	Sahinalp},
  title = {Combinatorial algorithms for structural variation detection in high-throughput
	sequenced genomes},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1270--1278},
  month = jul,
  abstract = {Recent studies show that along with single nucleotide polymorphisms
	and small indels, larger structural variants among human individuals
	are common. The Human Genome Structural Variation Project aims to
	identify and classify deletions, insertions, and inversions (>5 Kbp)
	in a small number of normal individuals with a fosmid-based paired-end
	sequencing approach using traditional sequencing technologies. The
	realization of new ultra-high-throughput sequencing platforms now
	makes it feasible to detect the full spectrum of genomic variation
	among many individual genomes, including cancer patients and others
	suffering from diseases of genomic origin. Unfortunately, existing
	algorithms for identifying structural variation (SV) among individuals
	have not been designed to handle the short read lengths and the errors
	implied by the "next-gen" sequencing (NGS) technologies. In this
	paper, we give combinatorial formulations for the SV detection between
	a reference genome sequence and a next-gen-based, paired-end, whole
	genome shotgun-sequenced individual. We describe efficient algorithms
	for each of the formulations we give, which all turn out to be fast
	and quite reliable; they are also applicable to all next-gen sequencing
	methods (Illumina, 454 Life Sciences [Roche], ABI SOLiD, etc.) and
	traditional capillary sequencing technology. We apply our algorithms
	to identify SV among individual genomes very recently sequenced by
	Illumina technology.},
  file = {main:Hormozdiari2009.pdf:PDF;supp:Hormozdiari2009-supp.pdf:PDF},
  institution = {School of Computing Science, Simon Fraser University, Burnaby, British
	Columbia, Canada V5A 1S6.},
  keywords = {Algorithms; Genetic Variation; Genome, Human; Humans; Sequence Analysis,
	DNA},
  owner = {calkan},
  pii = {gr.088633.108},
  pmid = {19447966},
  timestamp = {2009.09.18},
}

@ARTICLE{Hormozdiari2011a,
  author = {Fereydoun Hormozdiari and Can Alkan and Mario Ventura and Iman Hajirasouliha
	and Maika Malig and Faraz Hach and Deniz Yorukoglu and Phuong Dao
	and Marzieh Bakhshi and S. Cenk Sahinalp and Evan E Eichler},
  title = {Alu repeat discovery and characterization within human genomes},
  journal = {Genome Res},
  year = {2011},
  volume = {21},
  pages = {840--849},
  month = jun,
  abstract = {Human genomes are now being rapidly sequenced, but not all forms of
	genetic variation are routinely characterized. In this study, we
	focus on Alu retrotransposition events and seek to characterize differences
	in the pattern of mobile insertion between individuals based on the
	analysis of eight human genomes sequenced using next-generation sequencing.
	Applying a rapid read-pair analysis algorithm, we discover 4342 Alu
	insertions not found in the human reference genome and show that
	98\% of a selected subset (63/64) experimentally validate. Of these
	new insertions, 89\% correspond to AluY elements, suggesting that
	they arose by retrotransposition. Eighty percent of the Alu insertions
	have not been previously reported and more novel events were detected
	in Africans when compared with non-African samples (76\% vs. 69\%).
	Using these data, we develop an experimental and computational screen
	to identify ancestry informative Alu retrotransposition events among
	different human populations.},
  file = {main:Hormozdiari2011a.pdf:PDF;supp_figs:Hormozdiari2011a-suppfigs.pdf:PDF;supp_tabs:Hormozdiari2011a-supptabs.xls:Excel},
  institution = {School of Computing Science, Simon Fraser University, Burnaby, British
	Columbia V5A 1S6, Canada;},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.115956.110},
  pmid = {21131385},
  timestamp = {2011.06.17},
}

@ARTICLE{Hormozdiari2007,
  author = {Fereydoun Hormozdiari and Petra Berenbrink and Nata{\v{s}}a Pr{\v{z}}ulj and
	S. Cenk Sahinalp},
  title = {Not All Scale-Free Networks Are Born Equal: The Role of the Seed
	Graph in {PPI} Network Evolution},
  journal = {PLoS Comput Biol},
  year = {2007},
  volume = {3},
  pages = {e118},
  month = jul,
  abstract = {The (asymptotic) degree distributions of the best-known "scale-free"
	network models are all similar and are independent of the seed graph
	used; hence, it has been tempting to assume that networks generated
	by these models are generally similar. In this paper, we observe
	that several key topological features of such networks depend heavily
	on the specific model and the seed graph used. Furthermore, we show
	that starting with the "right" seed graph (typically a dense subgraph
	of the protein-protein interaction network analyzed), the duplication
	model captures many topological features of publicly available protein-protein
	interaction networks very well.},
  owner = {calkan},
  pii = {06-PLCB-RA-0519},
  pmid = {17616981},
  timestamp = {2008.03.05},
}

@ARTICLE{Hormozdiari2011,
  author = {Farhad Hormozdiari and Faraz Hach and S. Cenk Sahinalp and Evan E
	Eichler and Can Alkan},
  title = {Sensitive and fast mapping of di-base encoded reads},
  journal = {Bioinformatics},
  year = {2011},
  volume = {27},
  pages = {1915--1921},
  month = jul,
  abstract = {Discovering variation among high-throughput sequenced genomes relies
	on efficient and effective mapping of sequence reads. The speed,
	sensitivity and accuracy of read mapping are crucial to determining
	the full spectrum of single nucleotide variants (SNVs) as well as
	structural variants (SVs) in the donor genomes analyzed. We present
	drFAST, a read mapper designed for di-base encoded 'color-space'
	sequences generated with the AB SOLiD platform. drFAST is specially
	designed for better delineation of structural variants, including
	segmental duplications, and is able to return all possible map locations
	and underlying sequence variation of short reads within a user-specified
	distance threshold. We show that drFAST is more sensitive in comparison
	to all commonly used aligners such as Bowtie, BFAST and SHRiMP. drFAST
	is also faster than both BFAST and SHRiMP and achieves a mapping
	speed comparable to Bowtie. The source code for drFAST is available
	at http://drfast.sourceforge.net. Contact: calkan@u.washington.edu.},
  file = {main:Hormozdiari2011.pdf:PDF},
  institution = {Department of Genome Sciences, Howard Hughes Medical Institute, University
	of Washington, Seattle, WA 98195-5065, USA and School of Computing
	Science, Simon Fraser University, Burnaby, BC, V5A 1S6, Canada.},
  keywords = {drfast},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btr303},
  pmid = {21586516},
  timestamp = {2011.07.19},
}

@ARTICLE{Hormozdiari2010,
  author = {Fereydoun Hormozdiari and Iman Hajirasouliha and Phuong Dao and Faraz
	Hach and Deniz Yorukoglu and Can Alkan and Evan E Eichler and S.
	Cenk Sahinalp},
  title = {Next-generation {VariationHunter}: combinatorial algorithms for transposon
	insertion discovery},
  journal = {Bioinformatics},
  year = {2010},
  volume = {26},
  pages = {i350--i357},
  month = jun,
  abstract = {Recent years have witnessed an increase in research activity for the
	detection of structural variants (SVs) and their association to human
	disease. The advent of next-generation sequencing technologies make
	it possible to extend the scope of structural variation studies to
	a point previously unimaginable as exemplified by the 1000 Genomes
	Project. Although various computational methods have been described
	for the detection of SVs, no such algorithm is yet fully capable
	of discovering transposon insertions, a very important class of SVs
	to the study of human evolution and disease. In this article, we
	provide a complete and novel formulation to discover both loci and
	classes of transposons inserted into genomes sequenced with high-throughput
	sequencing technologies. In addition, we also present 'conflict resolution'
	improvements to our earlier combinatorial SV detection algorithm
	(VariationHunter) by taking the diploid nature of the human genome
	into consideration. We test our algorithms with simulated data from
	the Venter genome (HuRef) and are able to discover >85\% of transposon
	insertion events with precision of >90\%. We also demonstrate that
	our conflict resolution algorithm (denoted as VariationHunter-CR)
	outperforms current state of the art (such as original VariationHunter,
	BreakDancer and MoDIL) algorithms when tested on the genome of the
	Yoruba African individual (NA18507). AVAILABILITY: The implementation
	of algorithm is available at http://compbio.cs.sfu.ca/strvar.htm.
	SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics
	online.},
  file = {main:Hormozdiari2010.pdf:PDF;supp:Hormozdiari2010-supp.pdf:PDF},
  institution = {School of Computing Science, Simon Fraser University, Burnaby, BC,
	Canada.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btq216},
  pmid = {20529927},
  timestamp = {2010.09.15},
}

@ARTICLE{Hormozdiari2011b,
  author = {Fereydoun Hormozdiari and Iman Hajirasouliha and Andrew McPherson
	and Evan E Eichler and S. Cenk Sahinalp},
  title = {Simultaneous structural variation discovery among multiple paired-end
	sequenced genomes},
  journal = {Genome Res},
  year = {2011},
  volume = {21},
  pages = {2203--2212},
  month = dec,
  abstract = {With the increasing popularity of whole-genome shotgun sequencing
	(WGSS) via high-throughput sequencing technologies, it is becoming
	highly desirable to perform comparative studies involving multiple
	individuals (from a specific population, race, or a group sharing
	a particular phenotype). The conventional approach for a comparative
	genome variation study involves two key steps: (1) each paired-end
	high-throughput sequenced genome is compared with a reference genome
	and its (structural) differences are identified; (2) the lists of
	structural variants in each genome are compared against each other.
	In this study we propose to move away from this two-step approach
	to a novel one in which all genomes are compared with the reference
	genome simultaneously for obtaining much higher accuracy in structural
	variation detection. For this purpose, we introduce the maximum parsimony-based
	simultaneous structural variation discovery problem for a set of
	high-throughput sequenced genomes and provide efficient algorithms
	to solve it. We compare the proposed framework with the conventional
	framework, on the genomes of the Yoruban mother-father-child trio,
	as well as the CEU trio of European ancestry (both sequenced by Illumina
	platforms). We observed that the conventional framework predicts
	an unexpectedly high number of de novo variations in the child in
	comparison to the parents and misses some of the known variations.
	Our proposed framework, on the other hand, not only significantly
	reduces the number of incorrectly predicted de novo variations but
	also predicts more of the known (true) variations.},
  file = {main:Hormozdiari2011b.pdf:PDF;supp:Hormozdiari2011b-supp.pdf:PDF},
  institution = {School of Computing Science, Simon Fraser University, Burnaby, BC
	V5A 1S6, Canada.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.120501.111},
  pmid = {22048523},
  timestamp = {2012.03.03},
}

@ELECTRONIC{SSE,
  author = {{Intel}},
  title = {{Intel}{\textregistered} {SSE4} Programming Reference},
  url = {http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf},
}

@ARTICLE{IHGSC2001,
  author = {{International Human Genome Sequencing Consortium}},
  title = {Initial sequencing and analysis of the human genome},
  journal = {Nature},
  year = {2001},
  volume = {409},
  pages = {860--921},
  month = feb,
  file = {main:IHGSC2001.pdf:PDF;supp:IHGSC2001-supp.doc:Word},
  keywords = {Animals; Chromosome Mapping; Conserved Sequence; CpG Islands; DNA
	Transposable Elements; Databases, Factual; Drug Industry; Evolution,
	Molecular; Forecasting; GC Rich Sequence; Gene Duplication; Genes;
	Genetic Diseases, Inborn; Genetics, Medical; Genome, Human; Human
	Genome Project; Humans; Mutation; Private Sector; Proteins, genetics;
	Proteome; Public Sector; RNA, genetics; Repetitive Sequences, Nucleic
	Acid; Sequence Analysis, DNA, methods; Species Specificity},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {11237011},
  review = {E. S. Lander and L. M. Linton and B. Birren and C. Nusbaum and M.
	C. Zody and J. Baldwin and K. Devon and K. Dewar and M. Doyle and
	W. FitzHugh and R. Funke and D. Gage and K. Harris and A. Heaford
	and J. Howland and L. Kann and J. Lehoczky and R. LeVine and P. McEwan
	and K. McKernan and J. Meldrim and J. P. Mesirov and C. Miranda and
	W. Morris and J. Naylor and C. Raymond and M. Rosetti and R. Santos
	and A. Sheridan and C. Sougnez and N. Stange-Thomann and N. Stojanovic
	and A. Subramanian and D. Wyman and J. Rogers and J. Sulston and
	R. Ainscough and S. Beck and D. Bentley and J. Burton and C. Clee
	and N. Carter and A. Coulson and R. Deadman and P. Deloukas and A.
	Dunham and I. Dunham and R. Durbin and L. French and D. Grafham and
	S. Gregory and T. Hubbard and S. Humphray and A. Hunt and M. Jones
	and C. Lloyd and A. McMurray and L. Matthews and S. Mercer and S.
	Milne and J. C. Mullikin and A. Mungall and R. Plumb and M. Ross
	and R. Shownkeen and S. Sims and R. H. Waterston and R. K. Wilson
	and L. W. Hillier and J. D. McPherson and M. A. Marra and E. R. Mardis
	and L. A. Fulton and A. T. Chinwalla and K. H. Pepin and W. R. Gish
	and S. L. Chissoe and M. C. Wendl and K. D. Delehaunty and T. L.
	Miner and A. Delehaunty and J. B. Kramer and L. L. Cook and R. S.
	Fulton and D. L. Johnson and P. J. Minx and S. W. Clifton and T.
	Hawkins and E. Branscomb and P. Predki and P. Richardson and S. Wenning
	and T. Slezak and N. Doggett and J. F. Cheng and A. Olsen and S.
	Lucas and C. Elkin and E. Uberbacher and M. Frazier and R. A. Gibbs
	and D. M. Muzny and S. E. Scherer and J. B. Bouck and E. J. Sodergren
	and K. C. Worley and C. M. Rives and J. H. Gorrell and M. L. Metzker
	and S. L. Naylor and R. S. Kucherlapati and D. L. Nelson and G. M.
	Weinstock and Y. Sakaki and A. Fujiyama and M. Hattori and T. Yada
	and A. Toyoda and T. Itoh and C. Kawagoe and H. Watanabe and Y. Totoki
	and T. Taylor and J. Weissenbach and R. Heilig and W. Saurin and
	F. Artiguenave and P. Brottier and T. Bruls and E. Pelletier and
	C. Robert and P. Wincker and D. R. Smith and L. Doucette-Stamm and
	M. Rubenfield and K. Weinstock and H. M. Lee and J. Dubois and A.
	Rosenthal and M. Platzer and G. Nyakatura and S. Taudien and A. Rump
	and H. Yang and J. Yu and J. Wang and G. Huang and J. Gu and L. Hood
	and L. Rowen and A. Madan and S. Qin and R. W. Davis and N. A. Federspiel
	and A. P. Abola and M. J. Proctor and R. M. Myers and J. Schmutz
	and M. Dickson and J. Grimwood and D. R. Cox and M. V. Olson and
	R. Kaul and C. Raymond and N. Shimizu and K. Kawasaki and S. Minoshima
	and G. A. Evans and M. Athanasiou and R. Schultz and B. A. Roe and
	F. Chen and H. Pan and J. Ramser and H. Lehrach and R. Reinhardt
	and W. R. McCombie and M. de la Bastide and N. Dedhia and H. Blöcker
	and K. Hornischer and G. Nordsiek and R. Agarwala and L. Aravind
	and J. A. Bailey and A. Bateman and S. Batzoglou and E. Birney and
	P. Bork and D. G. Brown and C. B. Burge and L. Cerutti and H. C.
	Chen and D. Church and M. Clamp and R. R. Copley and T. Doerks and
	S. R. Eddy and E. E. Eichler and T. S. Furey and J. Galagan and J.
	G. Gilbert and C. Harmon and Y. Hayashizaki and D. Haussler and H.
	Hermjakob and K. Hokamp and W. Jang and L. S. Johnson and T. A. Jones
	and S. Kasif and A. Kaspryzk and S. Kennedy and W. J. Kent and P.
	Kitts and E. V. Koonin and I. Korf and D. Kulp and D. Lancet and
	T. M. Lowe and A. McLysaght and T. Mikkelsen and J. V. Moran and
	N. Mulder and V. J. Pollara and C. P. Ponting and G. Schuler and
	J. Schultz and G. Slater and A. F. Smit and E. Stupka and J. Szustakowski
	and D. Thierry-Mieg and J. Thierry-Mieg and L. Wagner and J. Wallis
	and R. Wheeler and A. Williams and Y. I. Wolf and K. H. Wolfe and
	S. P. Yang and R. F. Yeh and F. Collins and M. S. Guyer and J. Peterson
	and A. Felsenfeld and K. A. Wetterstrand and A. Patrinos and M. J.
	Morgan and P. de Jong and J. J. Catanese and K. Osoegawa and H. Shizuya
	and S. Choi and Y. J. Chen and J. Szustakowki and International Human
	Genome Sequencing Consortium},
  timestamp = {2011.07.19},
}

@ARTICLE{Itsara2009,
  author = {Andy Itsara and Gregory M Cooper and Carl Baker and Santhosh Girirajan
	and Jun Li and Devin Absher and Ronald M Krauss and Richard M Myers
	and Paul M Ridker and Daniel I Chasman and Heather Mefford and Phyllis
	Ying and Deborah A Nickerson and Evan E Eichler},
  title = {Population analysis of large copy number variants and hotspots of
	human genetic disease},
  journal = {Am J Hum Genet},
  year = {2009},
  volume = {84},
  pages = {148--161},
  month = feb,
  abstract = {Copy number variants (CNVs) contribute to human genetic and phenotypic
	diversity. However, the distribution of larger CNVs in the general
	population remains largely unexplored. We identify large variants
	in approximately 2500 individuals by using Illumina SNP data, with
	an emphasis on "hotspots" prone to recurrent mutations. We find variants
	larger than 500 kb in 5\%-10\% of individuals and variants greater
	than 1 Mb in 1\%-2\%. In contrast to previous studies, we find limited
	evidence for stratification of CNVs in geographically distinct human
	populations. Importantly, our sample size permits a robust distinction
	between truly rare and polymorphic but low-frequency copy number
	variation. We find that a significant fraction of individual CNVs
	larger than 100 kb are rare and that both gene density and size are
	strongly anticorrelated with allele frequency. Thus, although large
	CNVs commonly exist in normal individuals, which suggests that size
	alone can not be used as a predictor of pathogenicity, such variation
	is generally deleterious. Considering these observations, we combine
	our data with published CNVs from more than 12,000 individuals contrasting
	control and neurological disease collections. This analysis identifies
	known disease loci and highlights additional CNVs (e.g., 3q29, 16p12,
	and 15q25.2) for further investigation. This study provides one of
	the first analyses of large, rare (0.1\%-1\%) CNVs in the general
	population, with insights relevant to future analyses of genetic
	disease.},
  institution = {Department of Genome Sciences, School of Medicine, University of
	Washington, Seattle, WA 98195, USA.},
  keywords = {Gene Dosage; Gene Duplication; Genetic Diseases, Inborn; Genetic Variation;
	Genetics, Population; Genome, Human; Genotype; Geography; Humans;
	Oligonucleotide Array Sequence Analysis; Polymorphism, Genetic; Polymorphism,
	Single Nucleotide; Sequence Deletion},
  owner = {calkan},
  pii = {S0002-9297(09)00002-0},
  pmid = {19166990},
  timestamp = {2009.09.19},
}

@ARTICLE{Jeck2007,
  author = {William R Jeck and Josephine A Reinhardt and David A Baltrus and
	Matthew T Hickenbotham and Vincent Magrini and Elaine R Mardis and
	Jeffery L Dangl and Corbin D Jones},
  title = {Extending assembly of short {DNA} sequences to handle error},
  journal = {Bioinformatics},
  year = {2007},
  volume = {23},
  pages = {2942--2944},
  month = nov,
  abstract = {Inexpensive de novo genome sequencing, particularly in organisms with
	small genomes, is now possible using several new sequencing technologies.
	Some of these technologies such as that from Illumina's Solexa Sequencing,
	produce high genomic coverage by generating a very large number of
	small reads (approximately 30 bp). While prior work shows that partial
	assembly can be performed by k-mer extension in error-free reads,
	this algorithm is unsuccessful with the sequencing error rates found
	in practice. We present VCAKE (Verified Consensus Assembly by K-mer
	Extension), a modification of simple k-mer extension that overcomes
	error by using high depth coverage. Though it is a simple modification
	of a previous approach, we show significant improvements in assembly
	results on simulated and experimental datasets that include error.
	AVAILABILITY: http://152.2.15.114/~labweb/VCAKE},
  keywords = {Algorithms; Artifacts; Base Sequence; Chromosome Mapping; Consensus
	Sequence; DNA; Molecular Sequence Data; Reproducibility of Results;
	Sensitivity and Specificity; Sequence Alignment; Sequence Analysis,
	DNA},
  owner = {calkan},
  pii = {btm451},
  pmid = {17893086},
  timestamp = {2008.03.02},
}

@ARTICLE{Johnson2001,
  author = {M. E. Johnson and L. Viggiano and J. A. Bailey and M. Abdul-Rauf
	and G. Goodwin and M. Rocchi and E. E. Eichler},
  title = {Positive selection of a gene family during the emergence of humans
	and African apes},
  journal = {Nature},
  year = {2001},
  volume = {413},
  pages = {514--519},
  month = oct,
  abstract = {Gene duplication followed by adaptive evolution is one of the primary
	forces for the emergence of new gene function. Here we describe the
	recent proliferation, transposition and selection of a 20-kilobase
	(kb) duplicated segment throughout 15 Mb of the short arm of human
	chromosome 16. The dispersal of this segment was accompanied by considerable
	variation in chromosomal-map location and copy number among hominoid
	species. In humans, we identified a gene family (morpheus) within
	the duplicated segment. Comparison of putative protein-encoding exons
	revealed the most extreme case of positive selection among hominoids.
	The major episode of enhanced amino-acid replacement occurred after
	the separation of human and great-ape lineages from the orangutan.
	Positive selection continued to alter amino-acid composition after
	the divergence of human and chimpanzee lineages. The rapidity and
	bias for amino-acid-altering nucleotide changes suggest adaptive
	evolution of the morpheus gene family during the emergence of humans
	and African apes. Moreover, some genes emerge and evolve very rapidly,
	generating copies that bear little similarity to their ancestral
	precursors. Consequently, a small fraction of human genes may not
	possess discernible orthologues within the genomes of model organisms.},
  institution = {Department of Genetics and Center for Human Genetics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, Cleveland, Ohio 44106, USA.},
  keywords = {Animals; Chromosomes, Human, Pair 16; Evolution, Molecular; Gene Duplication;
	Hominidae, genetics; Humans; Molecular Sequence Data; Multigene Family;
	Phylogeny; Selection, Genetic; Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {35097067},
  pmid = {11586358},
  timestamp = {2011.07.19},
}

@ARTICLE{Karakoc2012,
  author = {Emre Karakoc and Can Alkan and Brian J O'Roak and Megan Y Dennis
	and Laura Vives and Kenneth Mark and Mark J Rieder and Debbie A Nickerson
	and Evan E Eichler},
  title = {Detection of structural variants and indels within exome data},
  journal = {Nat Methods},
  year = {2012},
  volume = {9},
  pages = {176--178},
  abstract = {We report an algorithm to detect structural variation and indels from
	1 base pair (bp) to 1 Mbp within exome sequence data sets. Splitread
	uses one end-anchored placements to cluster the mappings of subsequences
	of unanchored ends to identify the size, content and location of
	variants with high specificity and sensitivity. The algorithm discovers
	indels, structural variants, de novo events and copy number-polymorphic
	processed pseudogenes missed by other methods.},
  file = {main:Karakoc2011.pdf:PDF;supp:Karakoc2011-supp.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, Washington, USA.},
  language = {eng},
  medline-pst = {epublish},
  owner = {calkan},
  pii = {nmeth.1810},
  pmid = {22179552},
  timestamp = {2012.03.03},
}

@ARTICLE{Karakoc2006,
  author = {Emre Karakoc and S. Cenk Sahinalp and Artem Cherkasov},
  title = {Comparative {QSAR}- and fragments distribution analysis of drugs,
	druglikes, metabolic substances, and antimicrobial compounds},
  journal = {J Chem Inf Model},
  year = {2006},
  volume = {46},
  pages = {2167--2182},
  abstract = {A number of binary QSAR models have been developed using methods of
	artificial neural networks, k-nearest neighbors, linear discriminative
	analysis, and multiple linear regression and have been compared for
	their ability to recognize five types of chemical compounds that
	include conventional drugs, inactive druglikes, antimicrobial substituents,
	and bacterial and human metabolites. Thus, 20 binary classifiers
	have been created using a variety of 'inductive' and traditional
	2D QSAR descriptors which allowed up to 99\% accurate separation
	of the studied groups of activities. The comparison of the performance
	by four computational approaches demonstrated that the neural nets
	result in generally more accurate predictions, followed closely by
	k-nearest neighbors methods. It has also been demonstrated that complementation
	of 'inductive' descriptors with conventional QSAR parameters does
	not generally improve the quality of resulting solutions, conforming
	high predictive ability of 'inductive' variables. The conducted comparative
	QSAR analysis based on a novel linear optimization approach has helped
	to identify the extent of overlapping between the studied groups
	of compounds, such as cross-recognition of bacterial metabolites
	and antimicrobial compounds reflecting their immanent resemblance
	and similar origin. Human metabolites have been characterized as
	a very distinctive class of substances, separated from all other
	groups in the descriptors space and exhibiting different QSAR behavior.
	The analysis of unique structural fragments and substituents revealed
	inhomogeneous scale-free organization of human metabolites illustrating
	the fact that certain molecular scaffolds (such as sugars and nucleotides)
	may be strongly favored by natural evolution. The established scale-free
	organization of human metabolites has been contemplated as a factor
	of their unique positioning in the descriptors space and their distinctive
	QSAR properties. It is anticipated that the study may bring additional
	insight into QSAR determinants for conventional drugs, inactive chemicals,
	and metabolic substances and may help in rationalizing design and
	discovery of novel antimicrobials and human therapeutics with improved,
	metabolite-like properties.},
  keywords = {Anti-Infective Agents; Linear Models; Models, Molecular; Neural Networks
	(Computer); Pharmaceutical Preparations; Quantitative Structure-Activity
	Relationship},
  owner = {calkan},
  pmid = {16995747},
  timestamp = {2007.04.20},
}

@ARTICLE{Kent2002a,
  author = {W. James Kent},
  title = {{BLAT}--the {BLAST-like} alignment tool},
  journal = {Genome Res},
  year = {2002},
  volume = {12},
  pages = {656--664},
  month = apr,
  abstract = {Analyzing vertebrate genomes requires rapid mRNA/DNA and cross-species
	protein alignments. A new tool, BLAT, is more accurate and 500 times
	faster than popular existing tools for mRNA/DNA alignments and 50
	times faster for protein alignments at sensitivity settings typically
	used when comparing vertebrate sequences. BLAT's speed stems from
	an index of all nonoverlapping K-mers in the genome. This index fits
	inside the RAM of inexpensive computers, and need only be computed
	once for each genome assembly. BLAT has several major stages. It
	uses the index to find regions in the genome likely to be homologous
	to the query sequence. It performs an alignment between homologous
	regions. It stitches together these aligned regions (often exons)
	into larger alignments (typically genes). Finally, BLAT revisits
	small internal exons possibly missed at the first stage and adjusts
	large gap boundaries that have canonical splice sites where feasible.
	This paper describes how BLAT was optimized. Effects on speed and
	sensitivity are explored for various K-mer sizes, mismatch schemes,
	and number of required index matches. BLAT is compared with other
	alignment programs on various test sets and then used in several
	genome-wide applications. http://genome.ucsc.edu hosts a web-based
	BLAT server for the human genome.},
  keywords = {Animals; Computational Biology; DNA; Humans; Mice; Protein Biosynthesis;
	Proteins; RNA, Messenger; Sequence Alignment; Software},
  owner = {calkan},
  pmid = {11932250},
  timestamp = {2008.03.04},
}

@ARTICLE{Khaja2006,
  author = {Razi Khaja and Junjun Zhang and Jeffrey R MacDonald and Yongshu He
	and Ann M Joseph-George and John Wei and Muhammad A Rafiq and Cheng
	Qian and Mary Shago and Lorena Pantano and Hiroyuki Aburatani and
	Keith Jones and Richard Redon and Matthew Hurles and Lluis Armengol
	and Xavier Estivill and Richard J Mural and Charles Lee and Stephen
	W Scherer and Lars Feuk},
  title = {Genome assembly comparison identifies structural variants in the
	human genome},
  journal = {Nat Genet},
  year = {2006},
  volume = {38},
  pages = {1413--1418},
  month = dec,
  abstract = {Numerous types of DNA variation exist, ranging from SNPs to larger
	structural alterations such as copy number variants (CNVs) and inversions.
	Alignment of DNA sequence from different sources has been used to
	identify SNPs and intermediate-sized variants (ISVs). However, only
	a small proportion of total heterogeneity is characterized, and little
	is known of the characteristics of most smaller-sized (<50 kb) variants.
	Here we show that genome assembly comparison is a robust approach
	for identification of all classes of genetic variation. Through comparison
	of two human assemblies (Celera's R27c compilation and the Build
	35 reference sequence), we identified megabases of sequence (in the
	form of 13,534 putative non-SNP events) that were absent, inverted
	or polymorphic in one assembly. Database comparison and laboratory
	experimentation further demonstrated overlap or validation for 240
	variable regions and confirmed >1.5 million SNPs. Some differences
	were simple insertions and deletions, but in regions containing CNVs,
	segmental duplication and repetitive DNA, they were more complex.
	Our results uncover substantial undescribed variation in humans,
	highlighting the need for comprehensive annotation strategies to
	fully interpret genome scanning and personalized sequencing projects.},
  keywords = {Base Sequence; DNA; Genome, Human; Genomics; Humans; In Situ Hybridization,
	Fluorescence; Polymerase Chain Reaction; Sequence Alignment; Variation
	(Genetics)},
  owner = {calkan},
  pii = {ng1921},
  pmid = {17115057},
  timestamp = {2007.05.11},
}

@ARTICLE{Kidd2008,
  author = {Jeffrey M Kidd and Gregory M Cooper and William F Donahue and Hillary
	S Hayden and Nick Sampas and Tina Graves and Nancy Hansen and Brian
	Teague and Can Alkan and Francesca Antonacci and Eric Haugen and
	Troy Zerr and N. Alice Yamada and Peter Tsang and Tera L Newman and
	Eray T{\"u}z{\"u}n and Ze Cheng and Heather M Ebling and Nadeem Tusneem and
	Robert David and Will Gillett and Karen A Phelps and Molly Weaver
	and David Saranga and Adrianne Brand and Wei Tao and Erik Gustafson
	and Kevin McKernan and Lin Chen and Maika Malig and Joshua D Smith
	and Joshua M Korn and Steven A McCarroll and David A Altshuler and
	Daniel A Peiffer and Michael Dorschner and John Stamatoyannopoulos
	and David Schwartz and Deborah A Nickerson and James C Mullikin and
	Richard K Wilson and Laurakay Bruhn and Maynard V Olson and Rajinder
	Kaul and Douglas R Smith and Evan E Eichler},
  title = {Mapping and sequencing of structural variation from eight human genomes},
  journal = {Nature},
  year = {2008},
  volume = {453},
  pages = {56--64},
  month = may,
  abstract = {Genetic variation among individual humans occurs on many different
	scales, ranging from gross alterations in the human karyotype to
	single nucleotide changes. Here we explore variation on an intermediate
	scale--particularly insertions, deletions and inversions affecting
	from a few thousand to a few million base pairs. We employed a clone-based
	method to interrogate this intermediate structural variation in eight
	individuals of diverse geographic ancestry. Our analysis provides
	a comprehensive overview of the normal pattern of structural variation
	present in these genomes, refining the location of 1,695 structural
	variants. We find that 50\% were seen in more than one individual
	and that nearly half lay outside regions of the genome previously
	described as structurally variant. We discover 525 new insertion
	sequences that are not present in the human reference genome and
	show that many of these are variable in copy number between individuals.
	Complete sequencing of 261 structural variants reveals considerable
	locus complexity and provides insights into the different mutational
	processes that have shaped the human genome. These data provide the
	first high-resolution sequence map of human structural variation--a
	standard for genotyping platforms and a prelude to future individual
	genome sequencing projects.},
  file = {main:Kidd2008.pdf:PDF;supp:Kidd2008-supp.pdf:PDF},
  institution = {Department of Genome Sciences and Howard Hughes Medical Institute,
	University of Washington, Seattle, Washington 98195, USA.},
  owner = {calkan},
  pii = {nature06862},
  pmid = {18451855},
  timestamp = {2008.05.16},
}

@ARTICLE{Kidd2010,
  author = {Jeffrey M Kidd and Nick Sampas and Francesca Antonacci and Tina Graves
	and Robert Fulton and Hillary S Hayden and Can Alkan and Maika Malig
	and Mario Ventura and Giuliana Giannuzzi and Joelle Kallicki and
	Paige Anderson and Anya Tsalenko and N. Alice Yamada and Peter Tsang
	and Rajinder Kaul and Richard K Wilson and Laurakay Bruhn and Evan
	E Eichler},
  title = {Characterization of missing human genome sequences and copy-number
	polymorphic insertions},
  journal = {Nat Methods},
  year = {2010},
  volume = {7},
  pages = {365--371},
  month = may,
  abstract = {The extent of human genomic structural variation suggests that there
	must be portions of the genome yet to be discovered, annotated and
	characterized at the sequence level. We present a resource and analysis
	of 2,363 new insertion sequences corresponding to 720 genomic loci.
	We found that a substantial fraction of these sequences are either
	missing, fragmented or misassigned when compared to recent de novo
	sequence assemblies from short-read next-generation sequence data.
	We determined that 18-37\% of these new insertions are copy-number
	polymorphic, including loci that show extensive population stratification
	among Europeans, Asians and Africans. Complete sequencing of 156
	of these insertions identified new exons and conserved noncoding
	sequences not yet represented in the reference genome. We developed
	a method to accurately genotype these new insertions by mapping next-generation
	sequencing datasets to the breakpoint, thereby providing a means
	to characterize copy-number status for regions previously inaccessible
	to single-nucleotide polymorphism microarrays.},
  file = {main:Kidd2010.pdf:PDF;supp:Kidd2010-supp.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, USA.},
  keywords = {Contig Mapping, methods; DNA Transposable Elements, genetics; Gene
	Frequency; Genome, Human; Genomic Structural Variation, genetics;
	Genotype; Humans; In Situ Hybridization, Fluorescence; Molecular
	Sequence Data; Polymorphism, Genetic; Sequence Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {20440878},
  timestamp = {2011.07.19},
}

% Kitzman et al. 2011. The abstract writes "approximately" as $\sim$ so the
% file stays 7-bit clean for classic BibTeX.
@ARTICLE{Kitzman2011,
  author = {Jacob O Kitzman and Alexandra P Mackenzie and Andrew Adey and Joseph
	B Hiatt and Rupali P Patwardhan and Peter H Sudmant and Sarah B Ng
	and Can Alkan and Ruolan Qiu and Evan E Eichler and Jay Shendure},
  title = {Haplotype-resolved genome sequencing of a {Gujarati Indian} individual},
  journal = {Nat Biotechnol},
  year = {2011},
  volume = {29},
  pages = {59--63},
  month = jan,
  abstract = {Haplotype information is essential to the complete description and
	interpretation of genomes, genetic diversity and genetic ancestry.
	Although individual human genome sequencing is increasingly routine,
	nearly all such genomes are unresolved with respect to haplotype.
	Here we combine the throughput of massively parallel sequencing with
	the contiguity information provided by large-insert cloning to experimentally
	determine the haplotype-resolved genome of a South Asian individual.
	A single fosmid library was split into a modest number of pools,
	each providing $\sim$3\% physical coverage of the diploid genome. Sequencing
	of each pool yielded reads overwhelmingly derived from only one homologous
	chromosome at any given location. These data were combined with whole-genome
	shotgun sequence to directly phase 94\% of ascertained heterozygous
	single nucleotide polymorphisms (SNPs) into long haplotype blocks
	(N50 of 386 kilobases (kbp)). This method also facilitates the analysis
	of structural variation, for example, to anchor novel insertions
	to specific locations and haplotypes.},
  file = {main:Kitzman2011a.pdf:PDF;supp:Kitzman2011a-supp.pdf:PDF;supp_tab4:Kitzman2011-supptab4.xls:Excel},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, Washington, USA. kitz@uw.edu},
  keywords = {Asian Continental Ancestry Group, genetics; Base Sequence; Cell Line;
	Genome, Human, genetics; Haplotypes, genetics; Heterozygote; High-Throughput
	Nucleotide Sequencing, methods; Humans; Models, Molecular; Polymorphism,
	Single Nucleotide, genetics; Sequence Analysis, DNA, methods},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nbt.1740},
  pmid = {21170042},
  timestamp = {2011.06.17},
}

@ARTICLE{Korbel2009,
  author = {Jan Korbel and Alexej Abyzov and Xinmeng Mu and Nicholas Carriero
	and Philip Cayting and Zhengdong Zhang and Michael Snyder and Mark
	Gerstein},
  title = {{PEMer:} a computational framework with simulation-based error models
	for inferring genomic structural variants from massive paired-end
	sequencing data},
  journal = {Genome Biol},
  year = {2009},
  volume = {10},
  pages = {R23},
  month = feb,
  abstract = {ABSTRACT: Personal-genomics endeavors, such as the 1000 Genomes project,
	are generating maps of genomic structural variants by analyzing ends
	of massively sequenced genome fragments. To process these we developed
	Paired-End Mapper (PEMer; http://sv.gersteinlab.org/pemer). This
	comprises an analysis pipeline, compatible with several next-generation
	sequencing platforms; simulation-based error models, yielding confidence-values
	for each structural variant; and a back-end database. The simulations
	demonstrated high structural variant reconstruction efficiency for
	PEMer's coverage-adjusted multi-cutoff scoring-strategy and showed
	its relative insensitivity to base-calling errors.},
  institution = {Gene Expression Unit, European Molecular Biology Laboratory (EMBL),
	Meyerhofstr., Heidelberg, 69117, Germany. korbel@embl.de.},
  owner = {calkan},
  pii = {gb-2009-10-2-r23},
  pmid = {19236709},
  timestamp = {2009.09.22},
}

@ARTICLE{Korbel2007,
  author = {Jan O Korbel and Alexander Eckehart Urban and Jason P Affourtit and
	Brian Godwin and Fabian Grubert and Jan Fredrik Simons and Philip
	M Kim and Dean Palejev and Nicholas J Carriero and Lei Du and Bruce
	E Taillon and Zhoutao Chen and Andrea Tanzer and A. C Eugenia Saunders
	and Jianxiang Chi and Fengtang Yang and Nigel P Carter and Matthew
	E Hurles and Sherman M Weissman and Timothy T Harkins and Mark B
	Gerstein and Michael Egholm and Michael Snyder},
  title = {Paired-end mapping reveals extensive structural variation in the
	human genome},
  journal = {Science},
  year = {2007},
  volume = {318},
  pages = {420--426},
  month = oct,
  abstract = {Structural variation of the genome involves kilobase- to megabase-sized
	deletions, duplications, insertions, inversions, and complex combinations
	of rearrangements. We introduce high-throughput and massive paired-end
	mapping (PEM), a large-scale genome-sequencing method to identify
	structural variants (SVs) approximately 3 kilobases (kb) or larger
	that combines the rescue and capture of paired ends of 3-kb fragments,
	massive 454 sequencing, and a computational approach to map DNA reads
	onto a reference genome. PEM was used to map SVs in an African and
	in a putatively European individual and identified shared and divergent
	SVs relative to the reference genome. Overall, we fine-mapped more
	than 1000 SVs and documented that the number of SVs among humans
	is much larger than initially hypothesized; many of the SVs potentially
	affect gene function. The breakpoint junction sequences of more than
	200 SVs were determined with a novel pooling strategy and computational
	analysis. Our analysis provided insights into the mechanisms of SV
	formation in humans.},
  keywords = {Chromosome Mapping; Computational Biology; Female; Gene Fusion; Genome,
	Human; Humans; Inversion, Chromosome; Mutagenesis, Insertional; Mutation;
	Oligonucleotide Array Sequence Analysis; Recombination, Genetic;
	Repetitive Sequences, Nucleic Acid; Retroelements; Sequence Analysis,
	DNA; Sequence Deletion; Variation (Genetics)},
  owner = {calkan},
  pii = {1149504},
  pmid = {17901297},
  timestamp = {2008.02.06},
}

@ARTICLE{Korlach2008,
  author = {Jonas Korlach and Patrick J Marks and Ronald L Cicero and Jeremy
	J Gray and Devon L Murphy and Daniel B Roitman and Thang T Pham and
	Geoff A Otto and Mathieu Foquet and Stephen W Turner},
  title = {Selective aluminum passivation for targeted immobilization of single
	{DNA} polymerase molecules in zero-mode waveguide nanostructures},
  journal = {Proc Natl Acad Sci U S A},
  year = {2008},
  volume = {105},
  pages = {1176--1181},
  month = jan,
  abstract = {Optical nanostructures have enabled the creation of subdiffraction
	detection volumes for single-molecule fluorescence microscopy. Their
	applicability is extended by the ability to place molecules in the
	confined observation volume without interfering with their biological
	function. Here, we demonstrate that processive DNA synthesis thousands
	of bases in length was carried out by individual DNA polymerase molecules
	immobilized in the observation volumes of zero-mode waveguides (ZMWs)
	in high-density arrays. Selective immobilization of polymerase to
	the fused silica floor of the ZMW was achieved by passivation of
	the metal cladding surface using polyphosphonate chemistry, producing
	enzyme density contrasts of glass over aluminum in excess of 400:1.
	Yields of single-molecule occupancies of approximately 30\% were
	obtained for a range of ZMW diameters (70-100 nm). Results presented
	here support the application of immobilized single DNA polymerases
	in ZMW arrays for long-read-length DNA sequencing.},
  institution = {Pacific Biosciences, 1505 Adams Drive, Menlo Park, CA 94025, USA.},
  keywords = {Aluminum; DNA, Circular; DNA-Directed RNA Polymerases; Enzymes, Immobilized;
	Glass; Microscopy, Fluorescence; Nanostructures; Optics and Photonics;
	Phosphonic Acids; Polyvinyls; Protein Array Analysis; Surface Properties;
	Templates, Genetic},
  owner = {calkan},
  pii = {0710982105},
  pmid = {18216253},
  timestamp = {2009.04.12},
}

@ARTICLE{Korn2008,
  author = {Joshua M Korn and Finny G Kuruvilla and Steven A McCarroll and Alec
	Wysoker and James Nemesh and Simon Cawley and Earl Hubbell and Jim
	Veitch and Patrick J Collins and Katayoon Darvishi and Charles Lee
	and Marcia M Nizzari and Stacey B Gabriel and Shaun Purcell and Mark
	J Daly and David Altshuler},
  title = {Integrated genotype calling and association analysis of {SNP}s, common
	copy number polymorphisms and rare {CNV}s},
  journal = {Nat Genet},
  year = {2008},
  volume = {40},
  pages = {1253--1260},
  month = oct,
  abstract = {Accurate and complete measurement of single nucleotide (SNP) and copy
	number (CNV) variants, both common and rare, will be required to
	understand the role of genetic variation in disease. We present Birdsuite,
	a four-stage analytical framework instantiated in software for deriving
	integrated and mutually consistent copy number and SNP genotypes.
	The method sequentially assigns copy number across regions of common
	copy number polymorphisms (CNPs), calls genotypes of SNPs, identifies
	rare CNVs via a hidden Markov model (HMM), and generates an integrated
	sequence and copy number genotype at every locus (for example, including
	genotypes such as A-null, AAB and BBB in addition to AA, AB and BB
	calls). Such genotypes more accurately depict the underlying sequence
	of each individual, reducing the rate of apparent mendelian inconsistencies.
	The Birdsuite software is applied here to data from the Affymetrix
	SNP 6.0 array. Additionally, we describe a method, implemented in
	PLINK, to utilize these combined SNP and CNV genotypes for association
	testing with a phenotype.},
  institution = {Broad Institute of Harvard and Massachusetts Institute of Technology,
	Cambridge, Massachusetts 02142, USA. jkorn@broad.mit.edu},
  keywords = {Algorithms; Chromosomes, Human; Chromosomes, Human, Pair 4; DNA; Female;
	Gene Dosage; Genome, Human; Genotype; Haplotypes; Humans; Male; Markov
	Chains; Models, Statistical; Oligonucleotide Array Sequence Analysis;
	Polymerase Chain Reaction; Polymorphism, Single Nucleotide; Software},
  owner = {calkan},
  pii = {ng.237},
  pmid = {18776909},
  timestamp = {2009.09.20},
}

@ARTICLE{Lander1987,
  author = {E. S. Lander and D. Botstein},
  title = {Homozygosity mapping: a way to map human recessive traits with the
	{DNA} of inbred children},
  journal = {Science},
  year = {1987},
  volume = {236},
  pages = {1567--1570},
  month = jun,
  abstract = {An efficient strategy for mapping human genes that cause recessive
	traits has been devised that uses mapped restriction fragment length
	polymorphisms (RFLPs) and the DNA of affected children from consanguineous
	marriages. The method involves detection of the disease locus by
	virtue of the fact that the adjacent region will preferentially be
	homozygous by descent in such inbred children. A single affected
	child of a first-cousin marriage is shown to contain the same total
	information about linkage as a nuclear family with three affected
	children. Calculations show that it should be practical to map a
	recessive disease gene by studying DNA from fewer than a dozen unrelated,
	affected inbred children, given a complete RFLP linkage map. The
	method should make it possible to map many recessive diseases for
	which it is impractical or impossible to collect adequate numbers
	of families with multiple affected offspring.},
  keywords = {Chromosome Mapping, methods; Consanguinity; DNA, genetics; Genes,
	Recessive; Genetic Diseases, Inborn, genetics; Genetic Linkage; Homozygote;
	Pedigree; Polymorphism, Restriction Fragment Length},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {2884728},
  timestamp = {2012.03.15}
}

@ARTICLE{Langmead2009,
  author = {Ben Langmead and Cole Trapnell and Mihai Pop and Steven Salzberg},
  title = {Ultrafast and memory-efficient alignment of short {DNA} sequences
	to the human genome},
  journal = {Genome Biol},
  year = {2009},
  volume = {10},
  pages = {R25},
  month = mar,
  abstract = {ABSTRACT: Bowtie is an ultrafast, memory-efficient alignment program
	for aligning short DNA sequence reads to large genomes. For the human
	genome, Burrows-Wheeler indexing allows Bowtie to align more than
	25 million reads per CPU hour with a memory footprint of approximately
	1.3 gigabytes. Bowtie extends previous Burrows-Wheeler techniques
	with a novel quality-aware backtracking algorithm that permits mismatches.
	Multiple processor cores can be used simultaneously to achieve even
	greater alignment speeds. Bowtie is open source http://bowtie.cbcb.umd.edu.},
  institution = {Center for Bioinformatics and Computational Biology, Institute for
	Advanced Computer Studies, University of Maryland, College Park,
	MD 20742, USA. langmead@cs.umd.edu.},
  owner = {calkan},
  pii = {gb-2009-10-3-r25},
  pmid = {19261174},
  timestamp = {2009.04.07},
}

@ARTICLE{Lee2008,
  author = {Seunghak Lee and Elango Cheran and Michael Brudno},
  title = {A robust framework for detecting structural variations in a genome},
  journal = {Bioinformatics},
  year = {2008},
  volume = {24},
  pages = {i59--i67},
  month = jul,
  abstract = {MOTIVATION: Recently, structural genomic variants have come to the
	forefront as a significant source of variation in the human population,
	but the identification of these variants in a large genome remains
	a challenge. The complete sequencing of a human individual is prohibitive
	at current costs, while current polymorphism detection technologies,
	such as SNP arrays, are not able to identify many of the large scale
	events. One of the most promising methods to detect such variants
	is the computational mapping of clone-end sequences to a reference
	genome. RESULTS: Here, we present a probabilistic framework for the
	identification of structural variants using clone-end sequencing.
	Unlike previous methods, our approach does not rely on an a priori
	determined mapping of all reads to the reference. Instead, we build
	a framework for finding the most probable assignment of sequenced
	clones to potential structural variants based on the other clones.
	We compare our predictions with the structural variants identified
	in three previous studies. While there is a statistically significant
	correlation between the predictions, we also find a significant number
	of previously uncharacterized structural variants. Furthermore, we
	identify a number of putative cross-chromosomal events, primarily
	located proximally to the centromeres of the chromosomes. AVAILABILITY:
	Our dataset, results and source code are available at http://compbio.cs.toronto.edu/structvar/.},
  institution = {Department of Computer Science, University of Toronto, Toronto, ON
	M5S 3G4, Canada. seunghak@cs.toronto.edu},
  keywords = {Algorithms; Base Sequence; Chromosome Mapping; Computer Simulation;
	Genome; Models, Genetic; Models, Statistical; Molecular Sequence
	Data; Sequence Analysis, DNA; Variation (Genetics)},
  owner = {calkan},
  pii = {btn176},
  pmid = {18586745},
  timestamp = {2008.09.29},
}

@ARTICLE{Lee2009,
  author = {Seunghak Lee and Fereydoun Hormozdiari and Can Alkan and Michael
	Brudno},
  title = {{MoDIL}: detecting small indels from clone-end sequencing with mixtures
	of distributions},
  journal = {Nat Methods},
  year = {2009},
  volume = {6},
  pages = {473--474},
  month = jul,
  file = {main:Lee2009.pdf:PDF;supp:Lee2009-supp.pdf:PDF},
  keywords = {Algorithms; DNA Mutational Analysis; Databases, Nucleic Acid; Gene
	Library; Genetic Variation; Humans; INDEL Mutation; Software},
  owner = {calkan},
  pii = {nmeth.f.256},
  pmid = {19483690},
  timestamp = {2009.09.18},
}

@ARTICLE{Lee2011,
  author = {Sherry Lee and Kelly G Paulson and Elizabeth P Murchison and Olga
	K Afanasiev and Can Alkan and J. Helen Leonard and David R Byrd and
	Gregory J Hannon and Paul Nghiem},
  title = {Identification and validation of a novel mature {microRNA} encoded
	by the {Merkel} cell polyomavirus in human {Merkel} cell carcinomas},
  journal = {J Clin Virol},
  year = {2011},
  volume = {52},
  pages = {272--275},
  month = nov,
  abstract = {Merkel cell polyomavirus (MCPyV) is present in approximately 80\%
	of human Merkel cell carcinomas (MCCs). A previous in silico prediction
	suggested MCPyV encodes a microRNA (miRNA) that may regulate cellular
	and viral genes. To determine the presence and prevalence of a putative
	MCPyV-encoded miRNA in human MCC tumors. Over 30 million small RNAs
	from 7 cryopreserved MCC tumors and 1 perilesional sample were sequenced.
	45 additional MCC tumors were examined for expression of an MCPyV-encoded
	mature miRNA by reverse transcription real-time PCR. An MCPyV-encoded
	mature miRNA, "MCV-miR-M1-5p", was detected by direct sequencing
	in 2 of 3 MCPyV-positive MCC tumors. Although a precursor miRNA,
	MCV-miR-M1, had been predicted in silico and studied in vitro by
	Seo et al., no MCPyV-encoded miRNAs have been directly detected in
	human tissues. Importantly, the mature sequence of MCV-miR-M1 found
	in vivo was identical in all 79 reads obtained but differed from
	the in silico predicted mature miRNA by a 2-nucleotide shift, resulting
	in a distinct seed region and a different set of predicted target
	genes. This mature miRNA was detected by real-time PCR in 50\% of
	MCPyV-positive MCCs (n=38) and in 0\% of MCPyV-negative MCCs (n=13). MCV-miR-M1-5p
	is expressed at low levels in 50\% of MCPyV-positive MCCs. This virus-encoded
	miRNA is predicted to target genes that may play a role in promoting
	immune evasion and regulating viral DNA replication.},
  file = {main:Lee2011.pdf:PDF;supp:Lee2011supp.pdf:PDF},
  institution = {Department of Medicine/Dermatology, University of Washington, Seattle,
	WA, 98109, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S1386-6532(11)00329-5},
  pmid = {21907614},
  timestamp = {2011.11.08},
}

% Levenshtein's edit-distance paper (English translation in Soviet Physics Doklady).
% NOTE(review): volume/number/pages/month taken from the published record; confirm before release.
@ARTICLE{levenshtein1966,
  author = {Vladimir I. Levenshtein},
  title = {Binary codes capable of correcting deletions, insertions, and reversals},
  journal = {Soviet Physics Doklady},
  year = {1966},
  volume = {10},
  number = {8},
  pages = {707--710},
  month = feb
}

@ARTICLE{Levy2007,
  author = {Samuel Levy and Granger Sutton and Pauline C Ng and Lars Feuk and
	Aaron L Halpern and Brian P Walenz and Nelson Axelrod and Jiaqi Huang
	and Ewen F Kirkness and Gennady Denisov and Yuan Lin and Jeffrey
	R MacDonald and Andy Wing Chun Pang and Mary Shago and Timothy B
	Stockwell and Alexia Tsiamouri and Vineet Bafna and Vikas Bansal
	and Saul A Kravitz and Dana A Busam and Karen Y Beeson and Tina C
	McIntosh and Karin A Remington and Josep F Abril and John Gill and
	Jon Borman and Yu-Hui Rogers and Marvin E Frazier and Stephen W Scherer
	and Robert L Strausberg and J. Craig Venter},
  title = {The diploid genome sequence of an individual human},
  journal = {PLoS Biol},
  year = {2007},
  volume = {5},
  pages = {e254},
  month = sep,
  abstract = {Presented here is a genome sequence of an individual human. It was
	produced from approximately 32 million random DNA fragments, sequenced
	by Sanger dideoxy technology and assembled into 4,528 scaffolds,
	comprising 2,810 million bases (Mb) of contiguous sequence with approximately
	7.5-fold coverage for any given region. We developed a modified version
	of the Celera assembler to facilitate the identification and comparison
	of alternate alleles within this individual diploid genome. Comparison
	of this genome and the National Center for Biotechnology Information
	human reference assembly revealed more than 4.1 million DNA variants,
	encompassing 12.3 Mb. These variants (of which 1,288,319 were novel)
	included 3,213,401 single nucleotide polymorphisms (SNPs), 53,823
	block substitutions (2-206 bp), 292,102 heterozygous insertion/deletion
	events (indels)(1-571 bp), 559,473 homozygous indels (1-82,711 bp),
	90 inversions, as well as numerous segmental duplications and copy
	number variation regions. Non-SNP DNA variation accounts for 22\%
	of all events identified in the donor, however they involve 74\%
	of all variant bases. This suggests an important role for non-SNP
	genetic alterations in defining the diploid genome structure. Moreover,
	44\% of genes were heterozygous for one or more variants. Using a
	novel haplotype assembly strategy, we were able to span 1.5 Gb of
	genome sequence in segments >200 kb, providing further precision
	to the diploid nature of the genome. These data depict a definitive
	molecular portrait of a diploid human genome that provides a starting
	point for future genome comparisons and enables an era of individualized
	genomic information.},
  owner = {calkan},
  pii = {07-PLBI-RA-1258},
  pmid = {17803354},
  timestamp = {2008.03.03},
}

% SOAP2 short-read aligner. The author list is spelled out because a literal
% "et al." is parsed by BibTeX as a person; use "and others" if it must be truncated.
% NOTE(review): authors/volume/number/pages taken from the published record; confirm before release.
@ARTICLE{soap2,
  author = {Ruiqiang Li and Chang Yu and Yingrui Li and Tak-Wah Lam and Siu-Ming
	Yiu and Karsten Kristiansen and Jun Wang},
  title = {{SOAP2}: an improved ultrafast tool for short read alignment},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  number = {15},
  pages = {1966--1967},
  month = aug
}

@ARTICLE{Li2011a,
  author = {Heng Li},
  title = {Tabix: fast retrieval of sequence features from generic {TAB}-delimited
	files},
  journal = {Bioinformatics},
  year = {2011},
  volume = {27},
  pages = {718--719},
  month = mar,
  abstract = {Tabix is the first generic tool that indexes position sorted files
	in TAB-delimited formats such as GFF, BED, PSL, SAM and SQL export,
	and quickly retrieves features overlapping specified regions. Tabix
	features include few seek function calls per query, data compression
	with gzip compatibility and direct FTP/HTTP access. Tabix is implemented
	as a free command-line tool as well as a library in C, Java, Perl
	and Python. It is particularly useful for manually examining local
	genomic features on the command line and enables genome viewers to
	support huge data files and remote custom tracks over networks. AVAILABILITY
	AND IMPLEMENTATION: http://samtools.sourceforge.net.},
  institution = {Program in Medical Population Genetics, The Broad Institute of Harvard
	and MIT, Cambridge, MA 02142, USA. hengli@broadinstitute.org},
  keywords = {Algorithms; Computational Biology, methods; Genome; Genomics, methods;
	Information Storage and Retrieval, methods; Sequence Analysis, DNA,
	methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btq671},
  pmid = {21208982},
  timestamp = {2011.07.20},
}

@ARTICLE{Li2010a,
  author = {Heng Li and Richard Durbin},
  title = {Fast and accurate long-read alignment with {Burrows-Wheeler} transform},
  journal = {Bioinformatics},
  year = {2010},
  volume = {26},
  pages = {589--595},
  month = mar,
  abstract = {Many programs for aligning short sequencing reads to a reference genome
	have been developed in the last 2 years. Most of them are very efficient
	for short reads but inefficient or not applicable for reads >200
	bp because the algorithms are heavily and specifically tuned for
	short queries with low sequencing error rate. However, some sequencing
	platforms already produce longer reads and others are expected to
	become available soon. For longer reads, hashing-based software such
	as BLAT and SSAHA2 remain the only choices. Nonetheless, these methods
	are substantially slower than short-read aligners in terms of aligned
	bases per unit time. We designed and implemented a new algorithm,
	Burrows-Wheeler Aligner's Smith-Waterman Alignment (BWA-SW), to align
	long sequences up to 1 Mb against a large sequence database (e.g.
	the human genome) with a few gigabytes of memory. The algorithm is
	as accurate as SSAHA2, more accurate than BLAT, and is several to
	tens of times faster than both. http://bio-bwa.sourceforge.net},
  file = {main:Li2010a.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Wellcome Genome Campus, Cambridge,
	CB10 1SA, UK.},
  keywords = {Algorithms; Base Sequence; Genome, Human; Genomics, methods; Humans;
	Sequence Alignment, methods; Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btp698},
  pmid = {20080505},
  timestamp = {2011.07.19},
}

@ARTICLE{Li2009a,
  author = {Heng Li and Richard Durbin},
  title = {Fast and accurate short read alignment with {Burrows-Wheeler} transform},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  pages = {1754--1760},
  month = jul,
  abstract = {MOTIVATION: The enormous amount of short reads generated by the new
	DNA sequencing technologies call for the development of fast and
	accurate read alignment programs. A first generation of hash table-based
	methods has been developed, including MAQ, which is accurate, feature
	rich and fast enough to align short reads from a single individual.
	However, MAQ does not support gapped alignment for single-end reads,
	which makes it unsuitable for alignment of longer reads where indels
	may occur frequently. The speed of MAQ is also a concern when the
	alignment is scaled up to the resequencing of hundreds of individuals.
	RESULTS: We implemented Burrows-Wheeler Alignment tool (BWA), a new
	read alignment package that is based on backward search with Burrows-Wheeler
	Transform (BWT), to efficiently align short sequencing reads against
	a large reference sequence such as the human genome, allowing mismatches
	and gaps. BWA supports both base space reads, e.g. from Illumina
	sequencing machines, and color space reads from AB SOLiD machines.
	Evaluations on both simulated and real data suggest that BWA is approximately
	10-20x faster than MAQ, while achieving similar accuracy. In addition,
	BWA outputs alignment in the new standard SAM (Sequence Alignment/Map)
	format. Variant calling and other downstream analyses after the alignment
	can be achieved with the open source SAMtools software package. AVAILABILITY:
	http://maq.sourceforge.net.},
  file = {main:Li2009a.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Cambridge,
	CB10 1SA, UK.},
  keywords = {Algorithms; Genomics; Sequence Alignment; Sequence Analysis, DNA;
	Software},
  owner = {calkan},
  pii = {btp324},
  pmid = {19451168},
  timestamp = {2010.02.03},
}

% SAM format / SAMtools paper. The consortium author carries an extra brace pair
% so BibTeX keeps it as one indivisible name instead of parsing "Subgroup" as a surname.
@ARTICLE{Li2009b,
  author = {Heng Li and Bob Handsaker and Alec Wysoker and Tim Fennell and Jue
	Ruan and Nils Homer and Gabor Marth and Goncalo Abecasis and Richard
	Durbin and {1000 Genome Project Data Processing Subgroup}},
  title = {The {Sequence Alignment/Map} format and {SAMtools}},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  pages = {2078--2079},
  month = aug,
  abstract = {SUMMARY: The Sequence Alignment/Map (SAM) format is a generic alignment
	format for storing read alignments against reference sequences, supporting
	short and long reads (up to 128 Mbp) produced by different sequencing
	platforms. It is flexible in style, compact in size, efficient in
	random access and is the format in which alignments from the 1000
	Genomes Project are released. SAMtools implements various utilities
	for post-processing alignments in the SAM format, such as indexing,
	variant caller and alignment viewer, and thus provides universal
	tools for processing read alignments. AVAILABILITY: http://samtools.sourceforge.net.},
  file = {main:Li2009b.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Cambridge,
	CB10 1SA, UK, Broad Institute of MIT and Harvard, Cambridge, MA 02141,
	USA.},
  keywords = {Algorithms; Base Sequence; Computational Biology, methods; Genome;
	Genomics; Molecular Sequence Data; Sequence Alignment, methods; Sequence
	Analysis, DNA, methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {19505943},
  timestamp = {2011.07.20},
}

@ARTICLE{Li2008,
  author = {Heng Li and Jue Ruan and Richard Durbin},
  title = {Mapping short {DNA} sequencing reads and calling variants using mapping
	quality scores},
  journal = {Genome Res},
  year = {2008},
  volume = {18},
  pages = {1851--1858},
  month = nov,
  abstract = {New sequencing technologies promise a new era in the use of DNA sequence.
	However, some of these technologies produce very short reads, typically
	of a few tens of base pairs, and to use these reads effectively requires
	new algorithms and software. In particular, there is a major issue
	in efficiently aligning short reads to a reference genome and handling
	ambiguity or lack of accuracy in this alignment. Here we introduce
	the concept of mapping quality, a measure of the confidence that
	a read actually comes from the position it is aligned to by the mapping
	algorithm. We describe the software MAQ that can build assemblies
	by mapping shotgun short reads to a reference genome, using quality
	scores to derive genotype calls of the consensus sequence of a diploid
	genome, e.g., from a human sample. MAQ makes full use of mate-pair
	information and estimates the error probability of each read alignment.
	Error probabilities are also derived for the final genotype calls,
	using a Bayesian statistical model that incorporates the mapping
	qualities, error probabilities from the raw sequence quality scores,
	sampling of the two haplotypes, and an empirical model for correlated
	errors at a site. Both read mapping and genotype calling are evaluated
	on simulated data and real data. MAQ is accurate, efficient, versatile,
	and user-friendly. It is freely available at http://maq.sourceforge.net.},
  file = {main:Li2008.pdf:PDF;supp:Li2008-supp.pdf:PDF},
  institution = {The Wellcome Trust Sanger Institute, Hinxton CB10 1SA, United Kingdom.},
  keywords = {Algorithms; Bayes Theorem; Chromosome Mapping; Computer Simulation;
	DNA; DNA, Bacterial; Diploidy; Genome, Bacterial; Genome, Human;
	Humans; Polymorphism, Single Nucleotide; Reproducibility of Results;
	Salmonella paratyphi A; Sequence Alignment; Sequence Analysis, DNA;
	Software},
  owner = {calkan},
  pii = {gr.078212.108},
  pmid = {18714091},
  timestamp = {2009.01.12},
}

@ARTICLE{Li2009,
  author = {Ruiqiang Li and Yingrui Li and Xiaodong Fang and Huanming Yang and
	Jian Wang and Karsten Kristiansen and Jun Wang},
  title = {{SNP} detection for massively parallel whole-genome resequencing},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1124--1132},
  month = jun,
  abstract = {Next-generation massively parallel sequencing technologies provide
	ultrahigh throughput at two orders of magnitude lower unit cost than
	capillary Sanger sequencing technology. One of the key applications
	of next-generation sequencing is studying genetic variation between
	individuals using whole-genome or target region resequencing. Here,
	we have developed a consensus-calling and SNP-detection method for
	sequencing-by-synthesis Illumina Genome Analyzer technology. We designed
	this method by carefully considering the data quality, alignment,
	and experimental errors common to this technology. All of this information
	was integrated into a single quality score for each base under Bayesian
	theory to measure the accuracy of consensus calling. We tested this
	methodology using a large-scale human resequencing data set of 36x
	coverage and assembled a high-quality nonrepetitive consensus sequence
	for 92.25\% of the diploid autosomes and 88.07\% of the haploid X
	chromosome. Comparison of the consensus sequence with Illumina human
	1M BeadChip genotyped alleles from the same DNA sample showed that
	98.6\% of the 37,933 genotyped alleles on the X chromosome and 98\%
	of 999,981 genotyped alleles on autosomes were covered at 99.97\%
	and 99.84\% consistency, respectively. At a low sequencing depth,
	we used prior probability of dbSNP alleles and were able to improve
	coverage of the dbSNP sites significantly as compared to that obtained
	using a nonimputation model. Our analyses demonstrate that our method
	has a very low false call rate at any sequencing depth and excellent
	genome coverage at a high sequencing depth.},
  file = {main:Li2009.pdf:PDF},
  institution = {Beijing Genomics Institute at Shenzhen, Shenzhen 518000, China},
  keywords = {Algorithms; Asian Continental Ancestry Group; Chromosomes, Human,
	X; Computational Biology; Genetics, Population; Genome, Human; Genotype;
	Humans; Likelihood Functions; Models, Genetic; Models, Statistical;
	Polymorphism, Single Nucleotide; Probability; Reproducibility of
	Results; Sequence Analysis, DNA; Software; SOAPsnp},
  owner = {calkan},
  pii = {gr.088013.108},
  pmid = {19420381},
  timestamp = {2009.09.22},
}

% SOAP short-read aligner. Only the acronym is brace-protected (not the colon),
% the title carries no trailing period (the style adds punctuation), and the
% month uses the standard three-letter macro.
@ARTICLE{Li2008a,
  author = {Ruiqiang Li and Yingrui Li and Karsten Kristiansen and Jun Wang},
  title = {{SOAP}: short oligonucleotide alignment program},
  journal = {Bioinformatics},
  year = {2008},
  volume = {24},
  pages = {713--714},
  month = mar,
  abstract = {SUMMARY: We have developed a program SOAP for efficient gapped and
	ungapped alignment of short oligonucleotides onto reference sequences.
	The program is designed to handle the huge amounts of short reads
	generated by parallel sequencing using the new generation Illumina-Solexa
	sequencing technology. SOAP is compatible with numerous applications,
	including single-read or pair-end resequencing, small RNA discovery
	and mRNA tag sequence mapping. SOAP is a command-driven program,
	which supports multi-threaded parallel computing, and has a batch
	module for multiple query sets. AVAILABILITY: http://soap.genomics.org.cn.},
  file = {main:Li2008a.pdf:PDF},
  institution = {Beijing Genomics Institute at Shenzhen, Shenzhen 518083, China.},
  keywords = {Sequence Alignment; Sequence Analysis, DNA; Sequence Homology, Amino
	Acid},
  owner = {calkan},
  pii = {btn025},
  pmid = {18227114},
  timestamp = {2009.04.07},
}

% ZOOM spaced-seed read mapper. "Zillions" is brace-protected because
% sentence-casing styles would otherwise lowercase it after the "!".
@ARTICLE{Lin2008,
  author = {Hao Lin and Zefeng Zhang and Michael Q Zhang and Bin Ma and Ming
	Li},
  title = {{ZOOM}! {Zillions} of oligos mapped},
  journal = {Bioinformatics},
  year = {2008},
  volume = {24},
  pages = {2431--2437},
  month = nov,
  abstract = {MOTIVATION: The next generation sequencing technologies are generating
	billions of short reads daily. Resequencing and personalized medicine
	need much faster software to map these deep sequencing reads to a
	reference genome, to identify SNPs or rare transcripts. RESULTS:
	We present a framework for how full sensitivity mapping can be done
	in the most efficient way, via spaced seeds. Using the framework,
	we have developed software called ZOOM, which is able to map the
	Illumina/Solexa reads of 15x coverage of a human genome to the reference
	human genome in one CPU-day, allowing two mismatches, at full sensitivity.
	AVAILABILITY: ZOOM is freely available to non-commercial users at
	http://www.bioinfor.com/zoom},
  file = {main:Lin2008.pdf:PDF},
  institution = {Institute for Computing Technology, Chinese Academy of Sciences,
	Beijing, China.},
  keywords = {Algorithms; Contig Mapping; Genome, Human; Humans; Oligonucleotides;
	Sequence Alignment; Sequence Analysis, DNA; Software; Software Validation},
  owner = {calkan},
  pii = {btn416},
  pmid = {18684737},
  timestamp = {2009.04.07},
}

% NVIDIA Tesla architecture overview. Product names are brace-protected against
% sentence-casing, and the journal is stored under its real name rather than
% the inverted index form "Micro, IEEE".
@ARTICLE{SIMT_WARP,
  author = {Erik Lindholm and John Nickolls and Stuart Oberman and John Montrym},
  title = {{NVIDIA} {Tesla}: A Unified Graphics and Computing Architecture},
  journal = {IEEE Micro},
  year = {2008},
  volume = {28},
  number = {2},
  pages = {39--55},
}

% Primate genomic variation / repeat-driven genome expansion.
% The NISC Comparative Sequencing Program is a corporate author: it is wrapped
% in its own brace group so BibTeX treats it as one indivisible name instead of
% parsing "Program" as a surname with initials "N. I. S. C. C. S.".
@ARTICLE{Liu2003a,
  author = {Ge Liu and {NISC Comparative Sequencing Program} and Shaying
	Zhao and Jeffrey A Bailey and S. Cenk Sahinalp and Can Alkan and
	Eray Tuzun and Eric D Green and Evan E Eichler},
  title = {Analysis of primate genomic variation reveals a repeat-driven expansion
	of the human genome},
  journal = {Genome Res},
  year = {2003},
  volume = {13},
  pages = {358--368},
  month = mar,
  abstract = {We performed a detailed analysis of both single-nucleotide and large
	insertion/deletion events based on large-scale comparison of 10.6
	Mb of genomic sequence from lemur, baboon, and chimpanzee to human.
	Using a human genomic reference, optimal global alignments were constructed
	from large (>50-kb) genomic sequence clones. These alignments were
	examined for the pattern, frequency, and nature of mutational events.
	Whereas rates of single-nucleotide substitution remain relatively
	constant (1-2 x 10(-9) substitutions/site/year), rates of retrotransposition
	vary radically among different primate lineages. These differences
	have lead to a 15\%-20\% expansion of human genome size over the
	last 50 million years of primate evolution, 90\% of it due to new
	retroposon insertions. Orthologous comparisons with the chimpanzee
	suggest that the human genome continues to significantly expand due
	to shifts in retrotransposition activity. Assuming that the primate
	genome sequence we have sampled is representative, we estimate that
	human euchromatin has expanded 30 Mb and 550 Mb compared to the primate
	genomes of chimpanzee and lemur, respectively.},
  file = {main:Liu2003a.pdf:PDF;supp:Liu2003a-supp.pdf:PDF},
  institution = {Department of Genetics, Case Western Reserve University School of
	Medicine and University Hospitals of Cleveland, Cleveland, Ohio 44106,
	USA.},
  keywords = {Animals; Evolution, Molecular; Genetic Variation; Genome; Genome,
	Human; Humans; Lemur; Papio; Point Mutation; Pongo pygmaeus; Primates;
	Retroelements; Sequence Alignment},
  owner = {calkan},
  pmid = {12618366},
  timestamp = {2009.09.18},
}

% Alu repeat comparison across primate genomes. Title has no trailing period
% (the style supplies punctuation); month uses the standard macro.
@ARTICLE{Liu2009,
  author = {George E Liu and Can Alkan and Lu Jiang and Shaying Zhao and Evan
	E Eichler},
  title = {Comparative analysis of {Alu} repeats in primate genomes},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {876--885},
  month = may,
  abstract = {Using bacteria artificial chromosome (BAC) end sequences (16.9 Mb)
	and high-quality alignments of genomic sequences (17.4 Mb), we performed
	a global assessment of the divergence distributions, phylogenies,
	and consensus sequences for Alu elements in primates including lemur,
	marmoset, macaque, baboon, and chimpanzee as compared to human. We
	found that in lemurs, Alu elements show a broader and more symmetric
	sequence divergence distribution, suggesting a steady rate of Alu
	retrotransposition activity among prosimians. In contrast, Alu elements
	in anthropoids show a skewed distribution shifted toward more ancient
	elements with continual declining rates in recent Alu activity along
	the hominoid lineage of evolution. Using an integrated approach combining
	mutation profile and insertion/deletion analyses, we identified nine
	novel lineage-specific Alu subfamilies in lemur (seven), marmoset
	(one), and baboon/macaque (one) containing multiple diagnostic mutations
	distinct from their human counterparts-Alu J, S, and Y subfamilies,
	respectively. Among these primates, we show that that the lemur has
	the lowest density of Alu repeats (55 repeats/Mb), while marmoset
	has the greatest abundance (188 repeats/Mb). We estimate that approximately
	70\% of lemur and 16\% of marmoset Alu elements belong to lineage-specific
	subfamilies. Our analysis has provided an evolutionary framework
	for further classification and refinement of the Alu repeat phylogeny.
	The differences in the distribution and rates of Alu activity have
	played an important role in subtly reshaping the structure of primate
	genomes. The functional consequences of these changes among the diverse
	primate lineages over such short periods of evolutionary time are
	an important area of future investigation.},
  file = {main:Liu2009.pdf:PDF;supp:Liu2009-supp.pdf:PDF},
  institution = {USDA, ARS, ANRI, Bovine Functional Genomics Laboratory, Beltsville,
	MD 20705, USA. george.liu@ars.usda.gov},
  owner = {calkan},
  pii = {19/5/876},
  pmid = {19411604},
  timestamp = {2009.09.18},
}

% Orang-utan genome paper. The author list is deliberately truncated with the
% literal "and others" token, which the style renders as "et al.".
@ARTICLE{Locke2011,
  author = {Devin P Locke and LaDeana W Hillier and Wesley C Warren and Kim C
    Worley and Lynne V Nazareth and Donna M Muzny and Shiaw-Pyng Yang
    and Zhengyuan Wang and Asif T Chinwalla and Pat Minx and Makedonka
    Mitreva and Lisa Cook and Kim D Delehaunty and Catrina Fronick and
    Heather Schmidt and Lucinda A Fulton and Robert S Fulton and Joanne
    O Nelson and Vincent Magrini and Craig Pohl and Tina A Graves and
    Chris Markovic and Andy Cree and Huyen H Dinh and Jennifer Hume and
    Christie L Kovar and Gerald R Fowler and Gerton Lunter and Stephen
    Meader and Andreas Heger and others},
  title = {Comparative and demographic analysis of orang-utan genomes},
  journal = {Nature},
  year = {2011},
  volume = {469},
  pages = {529--533},
  month = jan,
  abstract = {'Orang-utan' is derived from a Malay term meaning 'man of the forest'
	and aptly describes the southeast Asian great apes native to Sumatra
	and Borneo. The orang-utan species, Pongo abelii (Sumatran) and Pongo
	pygmaeus (Bornean), are the most phylogenetically distant great apes
	from humans, thereby providing an informative perspective on hominid
	evolution. Here we present a Sumatran orang-utan draft genome assembly
	and short read sequence data from five Sumatran and five Bornean
	orang-utan genomes. Our analyses reveal that, compared to other primates,
	the orang-utan genome has many unique features. Structural evolution
	of the orang-utan genome has proceeded much more slowly than other
	great apes, evidenced by fewer rearrangements, less segmental duplication,
	a lower rate of gene family turnover and surprisingly quiescent Alu
	repeats, which have played a major role in restructuring other primate
	genomes. We also describe a primate polymorphic neocentromere, found
	in both Pongo species, emphasizing the gradual evolution of orang-utan
	genome structure. Orang-utans have extremely low energy usage for
	a eutherian mammal, far lower than their hominid relatives. Adding
	their genome to the repertoire of sequenced primates illuminates
	new signals of positive selection in several pathways including glycolipid
	metabolism. From the population perspective, both Pongo species are
	deeply diverse; however, Sumatran individuals possess greater diversity
	than their Bornean counterparts, and more species-specific variation.
	Our estimate of Bornean/Sumatran speciation time, 400,000 years ago,
	is more recent than most previous studies and underscores the complexity
	of the orang-utan speciation process. Despite a smaller modern census
	population size, the Sumatran effective population size (N(e)) expanded
	exponentially relative to the ancestral N(e) after the split, while
	Bornean N(e) declined over the same period. Overall, the resources
	and analyses presented here offer new opportunities in evolutionary
	genomics, insights into hominid biology, and an extensive database
	of variation for conservation efforts.},
  file = {main:Locke2011.pdf:PDF;supp:Locke2011-supp.pdf:PDF},
  institution = {The Genome Center at Washington University, Washington University
	School of Medicine, 4444 Forest Park Avenue, Saint Louis, Missouri
	63108, USA. dlocke@wustl.edu},
  keywords = {Animals; Centromere, genetics; Cerebrosides, metabolism; Chromosomes;
	Evolution, Molecular; Female; Gene Rearrangement, genetics; Genetic
	Speciation; Genetic Variation; Genetics, Population; Genome, genetics;
	Humans; Male; Phylogeny; Pongo abelii, genetics; Pongo pygmaeus,
	genetics; Population Density; Population Dynamics; Species Specificity},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature09687},
  pmid = {21270892},
  timestamp = {2011.06.17},
}

% Loss-of-function variant survey (Science, 2012).
% - The 1000 Genomes Project Consortium is a corporate author and is braced so
%   it is kept as a single name.
% - "Gallego Romero" is a compound surname, written in "Last, First" form so it
%   is not split into given name "Irene Gallego" + surname "Romero".
% - NOTE(review): institution previously held only the truncated fragment
%   "rvard.edu"; replaced with the corresponding author's affiliation. Confirm
%   against PubMed record 22344438.
@ARTICLE{MacArthur2012,
  author = {Daniel G MacArthur and Suganthi Balasubramanian and Adam Frankish
	and Ni Huang and James Morris and Klaudia Walter and Luke Jostins
	and Lukas Habegger and Joseph K Pickrell and Stephen B Montgomery
	and Cornelis A Albers and Zhengdong D Zhang and Donald F Conrad and
	Gerton Lunter and Hancheng Zheng and Qasim Ayub and Mark A DePristo
	and Eric Banks and Min Hu and Robert E Handsaker and Jeffrey A Rosenfeld
	and Menachem Fromer and Mike Jin and Xinmeng Jasmine Mu and Ekta
	Khurana and Kai Ye and Mike Kay and Gary Ian Saunders and Marie-Marthe
	Suner and Toby Hunt and If H A Barnes and Clara Amid and Denise R
	Carvalho-Silva and Alexandra H Bignell and Catherine Snow and Bryndis
	Yngvadottir and Suzannah Bumpstead and David N Cooper and Yali Xue
	and Gallego Romero, Irene and {1000 Genomes Project Consortium} and
	Jun Wang and Yingrui Li and Richard A Gibbs and Steven A McCarroll
	and Emmanouil T Dermitzakis and Jonathan K Pritchard and Jeffrey
	C Barrett and Jennifer Harrow and Matthew E Hurles and Mark B Gerstein
	and Chris Tyler-Smith},
  title = {A systematic survey of loss-of-function variants in human protein-coding
	genes},
  journal = {Science},
  year = {2012},
  volume = {335},
  pages = {823--828},
  month = feb,
  abstract = {Genome-sequencing studies indicate that all humans carry many genetic
	variants predicted to cause loss of function (LoF) of protein-coding
	genes, suggesting unexpected redundancy in the human genome. Here
	we apply stringent filters to 2951 putative LoF variants obtained
	from 185 human genomes to determine their true prevalence and properties.
	We estimate that human genomes typically contain $\sim$100 genuine LoF
	variants with $\sim$20 genes completely inactivated. We identify rare
	and likely deleterious LoF alleles, including 26 known and 21 predicted
	severe disease-causing variants, as well as common LoF variants in
	nonessential genes. We describe functional and evolutionary differences
	between LoF-tolerant and recessive disease genes and a method for
	using these differences to prioritize candidate genes found in clinical
	sequencing studies.},
  file = {Published version:MacArthur2012.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Hinxton, Cambridge CB10 1SA, UK.},
  keywords = {Disease, genetics; Gene Expression; Gene Frequency; Genetic Variation;
	Genome, Human; Humans; Phenotype; Polymorphism, Single Nucleotide;
	Proteins, genetics; Selection, Genetic},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {335/6070/823},
  pmid = {22344438},
  timestamp = {2012.03.16},
}

% CUDA Smith-Waterman on GPUs (BMC Bioinformatics supplement).
% The bound volume is 9; the supplement designation belongs in the issue
% ("number") field rather than inside "volume". "Smith-Waterman" is
% brace-protected so sentence-casing styles keep the proper names capitalised.
@ARTICLE{Manavski2008,
  author = {Svetlin A Manavski and Giorgio Valle},
  title = {{CUDA} compatible {GPU} cards as efficient hardware accelerators
	for {Smith-Waterman} sequence alignment},
  journal = {BMC Bioinformatics},
  year = {2008},
  volume = {9},
  number = {Suppl 2},
  pages = {S10},
  abstract = {BACKGROUND: Searching for similarities in protein and DNA databases
	has become a routine procedure in Molecular Biology. The Smith-Waterman
	algorithm has been available for more than 25 years. It is based
	on a dynamic programming approach that explores all the possible
	alignments between two sequences; as a result it returns the optimal
	local alignment. Unfortunately, the computational cost is very high,
	requiring a number of operations proportional to the product of the
	length of two sequences. Furthermore, the exponential growth of protein
	and DNA databases makes the Smith-Waterman algorithm unrealistic
	for searching similarities in large sets of sequences. For these
	reasons heuristic approaches such as those implemented in FASTA and
	BLAST tend to be preferred, allowing faster execution times at the
	cost of reduced sensitivity. The main motivation of our work is to
	exploit the huge computational power of commonly available graphic
	cards, to develop high performance solutions for sequence alignment.
	RESULTS: In this paper we present what we believe is the fastest
	solution of the exact Smith-Waterman algorithm running on commodity
	hardware. It is implemented in the recently released CUDA programming
	environment by NVidia. CUDA allows direct access to the hardware
	primitives of the last-generation Graphics Processing Units (GPU)
	G80. Speeds of more than 3.5 GCUPS (Giga Cell Updates Per Second)
	are achieved on a workstation running two GeForce 8800 GTX. Exhaustive
	tests have been done to compare our implementation to SSEARCH and
	BLAST, running on a 3 GHz Intel Pentium IV processor. Our solution
	was also compared to a recently published GPU implementation and
	to a Single Instruction Multiple Data (SIMD) solution. These tests
	show that our implementation performs from 2 to 30 times faster than
	any other previous attempt available on commodity hardware. CONCLUSIONS:
	The results show that graphic cards are now sufficiently advanced
	to be used as efficient hardware accelerators for sequence alignment.
	Their performance is better than any alternative available on commodity
	hardware platforms. The solution presented in this paper allows large
	scale alignments to be performed at low cost, using the exact Smith-Waterman
	algorithm instead of the largely adopted heuristic approaches.},
  file = {main:Manavski2008.pdf:PDF},
  institution = {CRIBI, University of Padova, Padova, Italy. svetlin.manavski@cribi.unipd.it},
  keywords = {Computer Graphics; Equipment Design; Equipment Failure Analysis; Information
	Storage and Retrieval; Sequence Alignment; Sequence Analysis; Signal
	Processing, Computer-Assisted},
  owner = {calkan},
  pii = {1471-2105-9-S2-S10},
  pmid = {18387198},
  timestamp = {2009.04.25},
}

% Review of next-generation sequencing technology. Title has no trailing
% period (the style supplies punctuation); month uses the standard macro.
@ARTICLE{Mardis2008,
  author = {Elaine R Mardis},
  title = {The impact of next-generation sequencing technology on genetics},
  journal = {Trends Genet},
  year = {2008},
  volume = {24},
  pages = {133--141},
  month = mar,
  abstract = {If one accepts that the fundamental pursuit of genetics is to determine
	the genotypes that explain phenotypes, the meteoric increase of DNA
	sequence information applied toward that pursuit has nowhere to go
	but up. The recent introduction of instruments capable of producing
	millions of DNA sequence reads in a single run is rapidly changing
	the landscape of genetics, providing the ability to answer questions
	with heretofore unimaginable speed. These technologies will provide
	an inexpensive, genome-wide sequence readout as an endpoint to applications
	ranging from chromatin immunoprecipitation, mutation mapping and
	polymorphism discovery to noncoding RNA discovery. Here I survey
	next-generation sequencing technologies and consider how they can
	provide a more complete picture of how the genome shapes the organism.},
  owner = {calkan},
  pii = {S0168-9525(08)00023-1},
  pmid = {18262675},
  timestamp = {2008.03.05},
}

% Roche/454 pyrosequencing paper (Margulies et al., Nature 2005).
% "and others" is the BibTeX token for a truncated author list ("et al." would
% be parsed as a person). The journal name goes in the required "journal"
% field; it had been stored in "institution", which article styles ignore.
@ARTICLE{roche454,
  author = {Marcel Margulies and others},
  title = {Genome sequencing in microfabricated high-density picolitre reactors},
  journal = {Nature},
  year = {2005},
  volume = {437},
  pages = {376--380},
  month = sep,
}

% Segmental duplication burst in the African great ape ancestor. Title has no
% trailing period (the style supplies punctuation); month uses the macro.
@ARTICLE{Marques-Bonet2009,
  author = {Tomas Marques-Bonet and Jeffrey M Kidd and Mario Ventura and Tina
    A Graves and Ze Cheng and LaDeana W Hillier and Zhaoshi Jiang and
    Carl Baker and Ray Malfavon-Borja and Lucinda A Fulton and Can Alkan
    and Gozde Aksay and Santhosh Girirajan and Priscillia Siswara and
    Lin Chen and Maria Francesca Cardone and Arcadi Navarro and Elaine
    R Mardis and Richard K Wilson and Evan E Eichler},
  title = {A burst of segmental duplications in the genome of the {African}
	great ape ancestor},
  journal = {Nature},
  year = {2009},
  volume = {457},
  pages = {877--881},
  month = feb,
  abstract = {It is generally accepted that the extent of phenotypic change between
	human and great apes is dissonant with the rate of molecular change.
	Between these two groups, proteins are virtually identical, cytogenetically
	there are few rearrangements that distinguish ape-human chromosomes,
	and rates of single-base-pair change and retrotransposon activity
	have slowed particularly within hominid lineages when compared to
	rodents or monkeys. Studies of gene family evolution indicate that
	gene loss and gain are enriched within the primate lineage. Here,
	we perform a systematic analysis of duplication content of four primate
	genomes (macaque, orang-utan, chimpanzee and human) in an effort
	to understand the pattern and rates of genomic duplication during
	hominid evolution. We find that the ancestral branch leading to human
	and African great apes shows the most significant increase in duplication
	activity both in terms of base pairs and in terms of events. This
	duplication acceleration within the ancestral species is significant
	when compared to lineage-specific rate estimates even after accounting
	for copy-number polymorphism and homoplasy. We discover striking
	examples of recurrent and independent gene-containing duplications
	within the gorilla and chimpanzee that are absent in the human lineage.
	Our results suggest that the evolutionary properties of copy-number
	mutation differ significantly from other forms of genetic mutation
	and, in contrast to the hominid slowdown of single-base-pair mutations,
	there has been a genomic burst of duplication activity at this period
	during human evolution.},
  file = {main:Marques-Bonet2009.pdf:PDF;supp_info:Marques-Bonet2009-supp.pdf:PDF;supp_tables:Marques-Bonet2009-supptable.xls:Excel;supp_figures:Marques-Bonet2009-suppfigs.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington and the Howard
	Hughes Medical Institute, Seattle, Washington 98195, USA.},
  keywords = {Africa; Animals; Catarrhini; Chromosome Mapping; Evolution, Molecular;
	Gene Duplication; Genome; Humans; Polymorphism, Genetic; Reproducibility
	of Results},
  owner = {calkan},
  pii = {nature07744},
  pmid = {19212409},
  timestamp = {2009.09.18},
}

% POLYBAYES SNP discovery paper. Title has no trailing period (the style
% supplies punctuation); month uses the standard macro.
@ARTICLE{Marth1999,
  author = {G. T. Marth and I. Korf and M. D. Yandell and R. T. Yeh and Z. Gu
	and H. Zakeri and N. O. Stitziel and L. Hillier and P. Y. Kwok and
	W. R. Gish},
  title = {A general approach to single-nucleotide polymorphism discovery},
  journal = {Nat Genet},
  year = {1999},
  volume = {23},
  pages = {452--456},
  month = dec,
  abstract = {Single-nucleotide polymorphisms (SNPs) are the most abundant form
	of human genetic variation and a resource for mapping complex genetic
	traits. The large volume of data produced by high-throughput sequencing
	projects is a rich and largely untapped source of SNPs (refs 2, 3,
	4, 5). We present here a unified approach to the discovery of variations
	in genetic sequence data of arbitrary DNA sources. We propose to
	use the rapidly emerging genomic sequence as a template on which
	to layer often unmapped, fragmentary sequence data and to use base
	quality values to discern true allelic variations from sequencing
	errors. By taking advantage of the genomic sequence we are able to
	use simpler yet more accurate methods for sequence organization:
	fragment clustering, paralogue identification and multiple alignment.
	We analyse these sequences with a novel, Bayesian inference engine,
	POLYBAYES, to calculate the probability that a given site is polymorphic.
	Rigorous treatment of base quality permits completely automated evaluation
	of the full length of all sequences, without limitations on alignment
	depth. We demonstrate this approach by accurate SNP predictions in
	human ESTs aligned to finished and working-draft quality genomic
	sequences, a data set representative of the typical challenges of
	sequence-based SNP discovery.},
  institution = {Washington University Department of Genetics and Genome Sequencing
	Center, St. Louis, Missouri, USA. gmarth@watson.wustl.edu},
  keywords = {Algorithms; Alleles; Bayes Theorem; Data Interpretation, Statistical;
	Expressed Sequence Tags; Genetic Techniques; Genome, Human; Humans;
	Polymorphism, Single Nucleotide; Sequence Alignment; Software; Variation
	(Genetics)},
  owner = {calkan},
  pmid = {10581034},
  timestamp = {2008.10.01},
}

% SOLiD two-base-encoding human genome paper.
% "De La Vega" is a compound surname; it is written in "Last, First" form so
% BibTeX does not treat "De La" as extra given names (which would abbreviate
% to "F. M. D. L. Vega").
@ARTICLE{McKernan2009,
  author = {Kevin Judd McKernan and Heather E Peckham and Gina L Costa and Stephen
	F McLaughlin and Yutao Fu and Eric F Tsung and Christopher R Clouser
	and Cisyla Duncan and Jeffrey K Ichikawa and Clarence C Lee and Zheng
	Zhang and Swati S Ranade and Eileen T Dimalanta and Fiona C Hyland
	and Tanya D Sokolsky and Lei Zhang and Andrew Sheridan and Haoning
	Fu and Cynthia L Hendrickson and Bin Li and Lev Kotler and Jeremy
	R Stuart and Joel A Malek and Jonathan M Manning and Alena A Antipova
	and Damon S Perez and Michael P Moore and Kathleen C Hayashibara
	and Michael R Lyons and Robert E Beaudoin and Brittany E Coleman
	and Michael W Laptewicz and Adam E Sannicandro and Michael D Rhodes
	and Rajesh K Gottimukkala and Shan Yang and Vineet Bafna and Ali
	Bashir and Andrew MacBride and Can Alkan and Jeffrey M Kidd and Evan
	E Eichler and Martin G Reese and De La Vega, Francisco M and Alan
	P Blanchard},
  title = {Sequence and structural variation in a human genome uncovered by
	short-read, massively parallel ligation sequencing using two-base
	encoding},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1527--1541},
  month = sep,
  abstract = {We describe the genome sequencing of an anonymous individual of African
	origin using a novel ligation-based sequencing assay that enables
	a unique form of error correction that improves the raw accuracy
	of the aligned reads to >99.9\%, allowing us to accurately call SNPs
	with as few as two reads per allele. We collected several billion
	mate-paired reads yielding approximately 18x haploid coverage of
	aligned sequence and close to 300x clone coverage. Over 98\% of the
	reference genome is covered with at least one uniquely placed read,
	and 99.65\% is spanned by at least one uniquely placed mate-paired
	clone. We identify over 3.8 million SNPs, 19\% of which are novel.
	Mate-paired data are used to physically resolve haplotype phases
	of nearly two-thirds of the genotypes obtained and produce phased
	segments of up to 215 kb. We detect 226,529 intra-read indels, 5590
	indels between mate-paired reads, 91 inversions, and four gene fusions.
	We use a novel approach for detecting indels between mate-paired
	reads that are smaller than the standard deviation of the insert
	size of the library and discover deletions in common with those detected
	with our intra-read approach. Dozens of mutations previously described
	in OMIM and hundreds of nonsynonymous single-nucleotide and structural
	variants in genes previously implicated in disease are identified
	in this individual. There is more genetic variation in the human
	genome still to be uncovered, and we provide guidance for future
	surveys in populations and cancer biopsies.},
  file = {main:McKernan2009.pdf:PDF;supp:McKernan2009-supp.doc:Word;tableS4:McKernan2009-tableS4.xls:Excel;tableS7:McKernan2009-tableS7.xls:Excel},
  institution = {Life Technologies, Beverly, Massachusetts 01915, USA. Kevin.McKernan@appliedbiosystems.com},
  owner = {calkan},
  pii = {gr.091868.109},
  pmid = {19546169},
  timestamp = {2009.09.18},
}

% SCOUT targeted CNV genotyping method. Title has no trailing period (the
% style supplies punctuation); month uses the standard macro.
@ARTICLE{Mefford2009,
  author = {Heather C Mefford and Gregory M Cooper and Troy Zerr and Joshua D
	Smith and Carl Baker and Neil Shafer and Erik C Thorland and Cindy
	Skinner and Charles E Schwartz and Deborah A Nickerson and Evan E
	Eichler},
  title = {A method for rapid, targeted {CNV} genotyping identifies rare variants
	associated with neurocognitive disease},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1579--1585},
  month = sep,
  abstract = {Copy-number variants (CNVs) are substantial contributors to human
	disease. A central challenge in CNV-disease association studies is
	to characterize the pathogenicity of rare and possibly incompletely
	penetrant events, which requires the accurate detection of rare CNVs
	in large numbers of individuals. Cost and throughput issues limit
	our ability to perform these studies. We have adapted the Illumina
	BeadXpress SNP genotyping assay and developed an algorithm, SNP-Conditional
	OUTlier detection (SCOUT), to rapidly and accurately detect both
	rare and common CNVs in large cohorts. This approach is customizable,
	cost effective, highly parallelized, and largely automated. We applied
	this method to screen 69 loci in 1105 children with unexplained intellectual
	disability, identifying pathogenic variants in 3.1\% of these individuals
	and potentially pathogenic variants in an additional 2.3\%. We identified
	seven individuals (0.7\%) with a deletion of 16p11.2, which has been
	previously associated with autism. Our results widen the phenotypic
	spectrum of these deletions to include intellectual disability without
	autism. We also detected 1.65-3.4 Mbp duplications at 16p13.11 in
	1.1\% of affected individuals and 350 kbp deletions at 15q11.2, near
	the Prader-Willi/Angelman syndrome critical region, in 0.8\% of affected
	individuals. Compared to published CNVs in controls they are significantly
	(P = 4.7 x 10(-5) and 0.003, respectively) enriched in these children,
	supporting previously published hypotheses that they are neurocognitive
	disease risk factors. More generally, this approach offers a previously
	unavailable balance between customization, cost, and throughput for
	analysis of CNVs and should prove valuable for targeted CNV detection
	in both research and diagnostic settings.},
  institution = {Department of Pediatrics, University of Washington, Seattle, Washington
	98195, USA.},
  owner = {calkan},
  pii = {gr.094987.109},
  pmid = {19506092},
  timestamp = {2009.09.19},
}

% Review of active human transposable elements. The title's "?" is part of the
% title and is kept; month uses the standard macro so the style controls its
% rendering.
@ARTICLE{Mills2007,
  author = {Ryan E Mills and E. Andrew Bennett and Rebecca C Iskow and Scott
	E Devine},
  title = {Which transposable elements are active in the human genome?},
  journal = {Trends Genet},
  year = {2007},
  volume = {23},
  pages = {183--191},
  month = apr,
  abstract = {Although a large proportion (44\%) of the human genome is occupied
	by transposons and transposon-like repetitive elements, only a small
	proportion (<0.05\%) of these elements remain active today. Recent
	evidence indicates that approximately 35-40 subfamilies of Alu, L1
	and SVA elements (and possibly HERV-K elements) remain actively mobile
	in the human genome. These active transposons are of great interest
	because they continue to produce genetic diversity in human populations
	and also cause human diseases by integrating into genes. In this
	review, we examine these active human transposons and explore mechanistic
	factors that influence their mobilization.},
  institution = {Department of Biochemistry, Emory University School of Medicine,
	Atlanta, GA 30322, USA.},
  keywords = {Base Pairing; Base Sequence; DNA Transposable Elements; Genome, Human;
	Humans; Molecular Sequence Data; Nucleic Acid Conformation; Repetitive
	Sequences, Nucleic Acid},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0168-9525(07)00059-5},
  pmid = {17331616},
  timestamp = {2011.07.20},
}

% Initial human INDEL variation map. Title has no trailing period (the style
% supplies punctuation); month uses the standard macro.
@ARTICLE{Mills2006,
  author = {Ryan E Mills and Christopher T Luttig and Christine E Larkins and
	Adam Beauchamp and Circe Tsui and W. Stephen Pittard and Scott E
	Devine},
  title = {An initial map of insertion and deletion ({INDEL}) variation in the
	human genome},
  journal = {Genome Res},
  year = {2006},
  volume = {16},
  pages = {1182--1190},
  month = sep,
  abstract = {Although many studies have been conducted to identify single nucleotide
	polymorphisms (SNPs) in humans, few studies have been conducted to
	identify alternative forms of natural genetic variation, such as
	insertion and deletion (INDEL) polymorphisms. In this report, we
	describe an initial map of human INDEL variation that contains 415,436
	unique INDEL polymorphisms. These INDELs were identified with a computational
	approach using DNA re-sequencing traces that originally were generated
	for SNP discovery projects. They range from 1 bp to 9989 bp in length
	and are split almost equally between insertions and deletions, relative
	to the chimpanzee genome sequence. Five major classes of INDELs were
	identified, including (1) insertions and deletions of single-base
	pairs, (2) monomeric base pair expansions, (3) multi-base pair expansions
	of 2-15 bp repeat units, (4) transposon insertions, and (5) INDELs
	containing random DNA sequences. Our INDELs are distributed throughout
	the human genome with an average density of one INDEL per 7.2 kb
	of DNA. Variation hotspots were identified with up to 48-fold regional
	increases in INDEL and/or SNP variation compared with the chromosomal
	averages for the same chromosomes. Over 148,000 INDELs (35.7\%) were
	identified within known genes, and 5542 of these INDELs were located
	in the promoters and exons of genes, where gene function would be
	expected to be influenced the greatest. All INDELs in this study
	have been deposited into dbSNP and have been integrated into maps
	of human genetic variation that are available to the research community.},
  file = {main:Mills2006.pdf:PDF},
  keywords = {Animals; Computational Biology; Genome, Human; Humans; Pan troglodytes;
	Polymorphism, Genetic; Polymorphism, Single Nucleotide; Sequence
	Deletion},
  owner = {calkan},
  pii = {gr.4565806},
  pmid = {16902084},
  timestamp = {2007.05.11},
}

@ARTICLE{Mills2011a,
  author = {Ryan E Mills and W. Stephen Pittard and Julienne M Mullaney and Umar
    Farooq and Todd H Creasy and Anup A Mahurkar and David M Kemeza and
    Daniel S Strassler and Chris P Ponting and Caleb Webber and Scott
    E Devine},
  title = {Natural genetic variation caused by small insertions and deletions
    in the human genome},
  journal = {Genome Research},
  year = {2011},
  volume = {21},
  pages = {830--839},
  month = jun,
  abstract = {Human genetic variation is expected to play a central role in personalized
    medicine. Yet only a fraction of the natural genetic variation that
    is harbored by humans has been discovered to date. Here we report
    almost 2 million small insertions and deletions (INDELs) that range
    from 1 bp to 10,000 bp in length in the genomes of 79 diverse humans.
    These variants include 819,363 small INDELs that map to human genes.
    Small INDELs frequently were found in the coding exons of these genes,
    and several lines of evidence indicate that such variation is a major
    determinant of human biological diversity. Microarray-based genotyping
    experiments revealed several interesting observations regarding the
    population genetics of small INDEL variation. For example, we found
    that many of our INDELs had high levels of linkage disequilibrium
    (LD) with both HapMap SNPs and with high-scoring SNPs from genome-wide
    association studies. Overall, our study indicates that small INDEL
    variation is likely to be a key factor underlying inherited traits
    and diseases in humans.},
  file = {main:Mills2011a.pdf:PDF},
  institution = {Department of Biochemistry, Emory University School of Medicine,
    Atlanta, Georgia 30322, USA;},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.115907.110},
  pmid = {21460062},
  timestamp = {2011.07.20},
}

@ARTICLE{mills2011nature1000genomes,
  author = {Ryan E Mills and Klaudia Walter and Chip Stewart and Robert E.
    Handsaker and Ken Chen and Can Alkan and Alexej Abyzov and Seungtai Chris Yoon
    and Kai Ye and R. Keira Cheetham and Asif Chinwalla and Donald F. Conrad and
    Yutao Fu and Fabian Grubert and Iman Hajirasouliha and Fereydoun Hormozdiari
    and Lilia M. Iakoucheva and Zamin Iqbal and Shuli Kang and Jeffrey M. Kidd and
    Miriam K. Konkel and Joshua Korn and Ekta Khurana and Deniz Kural and Hugo Y.
    K. Lam and Jing Leng and Ruiqiang Li and Yingrui Li and Chang-Yun Lin and
    Ruibang Luo and others},
  title = {Mapping copy number variation by population-scale genome sequencing},
  journal = {Nature},
  year = {2011},
  volume = {470},
  pages = {59--65},
  month = feb,
  abstract = {Genomic structural variants (SVs) are abundant in humans, differing
    from other forms of variation in extent, origin and functional impact.
    Despite progress in SV characterization, the nucleotide resolution
    architecture of most SVs remains unknown. We constructed a map of
    unbalanced SVs (that is, copy number variants) based on whole genome
    DNA sequencing data from 185 human genomes, integrating evidence
    from complementary SV discovery approaches with extensive experimental
    validations. Our map encompassed 22,025 deletions and 6,000 additional
    SVs, including insertions and tandem duplications. Most SVs (53\%)
    were mapped to nucleotide resolution, which facilitated analysing
    their origin and functional impact. We examined numerous whole and
    partial gene deletions with a genotyping approach and observed a
    depletion of gene disruptions amongst high frequency deletions. Furthermore,
    we observed differences in the size spectra of SVs originating from
    distinct formation mechanisms, and constructed a map of SV hotspots
    formed by common mechanisms. Our analytical framework and SV map
    serves as a resource for sequencing-based association studies.},
  file = {main:Mills2011.pdf:PDF;supp_info:Mills2011-suppinfo.pdf:PDF;supp_methods:Mills2011-suppmethods.pdf:PDF},
  institution = {Department of Pathology, Brigham and Women's Hospital, Harvard Medical
    School, Boston, Massachusetts, USA.},
  keywords = {DNA Copy Number Variations, genetics; Gene Duplication, genetics;
    Genetic Predisposition to Disease, genetics; Genetics, Population;
    Genome, Human, genetics; Genomics; Genotype; Humans; Mutagenesis,
    Insertional, genetics; Reproducibility of Results; Sequence Analysis,
    DNA; Sequence Deletion, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature09708},
  pmid = {21293372},
  timestamp = {2011.06.17},
}

% Needleman-Wunsch global alignment paper. The same work is also stored
% under the key "nw"; cite only one of the two keys per document, otherwise
% the reference is listed twice.
@ARTICLE{needleman,
  author = {Saul B. Needleman and Christian D. Wunsch},
  title = {A general method applicable to the search for similarities in the
    amino acid sequence of two proteins},
  journal = {Journal of Molecular Biology},
  year = {1970},
  volume = {48},
  pages = {443--453},
}

% Needleman-Wunsch global alignment paper (short citation key).
@ARTICLE{nw,
  author  = {Needleman, Saul B. and Wunsch, Christian D.},
  title   = {A general method applicable to the search for similarities in
             the amino acid sequence of two proteins},
  journal = {Journal of Molecular Biology},
  volume  = {48},
  pages   = {443--453},
  year    = {1970},
}

@ARTICLE{Nguyen2011,
  author = {Di Kim Nguyen and Fan Yang and Rajinder Kaul and Can Alkan and Anthony
    Antonellis and Karen F Friery and Baoli Zhu and Pieter J de Jong
    and Christine M Disteche},
  title = {{Clcn4-2} genomic structure differs between the {X} locus in {Mus}
    spretus and the autosomal locus in {Mus} musculus: {AT} motif enrichment
    on the {X}},
  journal = {Genome Research},
  year = {2011},
  volume = {21},
  pages = {402--409},
  month = mar,
  abstract = {In Mus spretus, the chloride channel 4 gene Clcn4-2 is X-linked and
    dosage compensated by X up-regulation and X inactivation, while in
    the closely related mouse species Mus musculus, Clcn4-2 has been
    translocated to chromosome 7. We sequenced Clcn4-2 in M. spretus
    and identified the breakpoints of the evolutionary translocation
    in the Mus lineage. Genetic and epigenetic differences were observed
    between the 5'ends of the autosomal and X-linked loci. Remarkably,
    Clcn4-2 introns have been truncated on chromosome 7 in M. musculus
    as compared with the X-linked loci from seven other eutherian mammals.
    Intron sequences specifically preserved in the X-linked loci were
    significantly enriched in AT-rich oligomers. Genome-wide analyses
    showed an overall enrichment in AT motifs unique to the eutherian
    X (except for genes that escape X inactivation), suggesting a role
    for these motifs in regulation of the X chromosome.},
  file = {main:Nguyen2011.pdf:PDF},
  institution = {Department of Pathology, University of Washington, Seattle, Washington
    98195, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.108563.110},
  pmid = {21282478},
  timestamp = {2011.06.17},
}

@ARTICLE{Ning2001,
  author = {Z. Ning and A. J. Cox and J. C. Mullikin},
  title = {{SSAHA}: a fast search method for large {DNA} databases},
  journal = {Genome Research},
  year = {2001},
  volume = {11},
  pages = {1725--1729},
  month = oct,
  abstract = {We describe an algorithm, SSAHA (Sequence Search and Alignment by
    Hashing Algorithm), for performing fast searches on databases containing
    multiple gigabases of DNA. Sequences in the database are preprocessed
    by breaking them into consecutive k-tuples of k contiguous bases
    and then using a hash table to store the position of each occurrence
    of each k-tuple. Searching for a query sequence in the database is
    done by obtaining from the hash table the "hits" for each k-tuple
    in the query sequence and then performing a sort on the results.
    We discuss the effect of the tuple length k on the search speed,
    memory usage, and sensitivity of the algorithm and present the results
    of computational experiments which show that SSAHA can be three to
    four orders of magnitude faster than BLAST or FASTA, while requiring
    less memory than suffix tree methods. The SSAHA algorithm is used
    for high-throughput single nucleotide polymorphism (SNP) detection
    and very large scale sequence assembly. Also, it provides Web-based
    sequence search facilities for Ensembl projects.},
  keywords = {Algorithms; Base Composition; Base Sequence; DNA; Database Management
    Systems; Databases, Factual; Sensitivity and Specificity; Sequence
    Alignment; Software},
  owner = {calkan},
  pmid = {11591649},
  timestamp = {2008.03.04},
}

@ARTICLE{Peiffer2006,
  author = {Daniel A Peiffer and Jennie M Le and Frank J Steemers and Weihua
    Chang and Tony Jenniges and Francisco Garcia and Kirt Haden and Jiangzhen
    Li and Chad A Shaw and John Belmont and Sau Wai Cheung and Richard
    M Shen and David L Barker and Kevin L Gunderson},
  title = {High-resolution genomic profiling of chromosomal aberrations using
    {Infinium} whole-genome genotyping},
  journal = {Genome Research},
  year = {2006},
  volume = {16},
  pages = {1136--1148},
  month = sep,
  abstract = {Array-CGH is a powerful tool for the detection of chromosomal aberrations.
    The introduction of high-density SNP genotyping technology to genomic
    profiling, termed SNP-CGH, represents a further advance, since simultaneous
    measurement of both signal intensity variations and changes in allelic
    composition makes it possible to detect both copy number changes
    and copy-neutral loss-of-heterozygosity (LOH) events. We demonstrate
    the utility of SNP-CGH with two Infinium whole-genome genotyping
    BeadChips, assaying 109,000 and 317,000 SNP loci, to detect chromosomal
    aberrations in samples bearing constitutional aberrations as well
    tumor samples at sub-100 kb effective resolution. Detected aberrations
    include homozygous deletions, hemizygous deletions, copy-neutral
    LOH, duplications, and amplifications. The statistical ability to
    detect common aberrations was modeled by analysis of an X chromosome
    titration model system, and sensitivity was modeled by titration
    of gDNA from a tumor cell with that of its paired normal cell line.
    Analysis was facilitated by using a genome browser that plots log
    ratios of normalized intensities and allelic ratios along the chromosomes.
    We developed two modes of SNP-CGH analysis, a single sample and a
    paired sample mode. The single sample mode computes log intensity
    ratios and allelic ratios by referencing to canonical genotype clusters
    generated from approximately 120 reference samples, whereas the paired
    sample mode uses a paired normal reference sample from the same individual.
    Finally, the two analysis modes are compared and contrasted for their
    utility in analyzing different types of input gDNA: low input amounts,
    fragmented gDNA, and Phi29 whole-genome pre-amplified DNA.},
  institution = {Illumina, Inc., San Diego, California 92121, USA.},
  keywords = {Cell Line, Tumor; Chromosome Aberrations; Chromosomes, Human; DNA;
    Female; Genome, Human; Genomics; Genotype; Humans; In Situ Hybridization,
    Fluorescence; Loss of Heterozygosity; Male; Oligonucleotide Array
    Sequence Analysis; Polymorphism, Single Nucleotide},
  owner = {calkan},
  pii = {gr.5402306},
  pmid = {16899659},
  timestamp = {2009.09.21},
}

@ARTICLE{Pop2008,
  author = {Mihai Pop and Steven L Salzberg},
  title = {Bioinformatics challenges of new sequencing technology},
  journal = {Trends in Genetics},
  year = {2008},
  volume = {24},
  pages = {142--149},
  month = mar,
  abstract = {New DNA sequencing technologies can sequence up to one billion bases
    in a single day at low cost, putting large-scale sequencing within
    the reach of many scientists. Many researchers are forging ahead
    with projects to sequence a range of species using the new technologies.
    However, these new technologies produce read lengths as short as
    35-40 nucleotides, posing challenges for genome assembly and annotation.
    Here we review the challenges and describe some of the bioinformatics
    systems that are being proposed to solve them. We specifically address
    issues arising from using these technologies in assembly projects,
    both de novo and for resequencing purposes, as well as efforts to
    improve genome annotation in the fragmented assemblies produced by
    short read lengths.},
  owner = {calkan},
  pii = {S0168-9525(08)00022-X},
  pmid = {18262676},
  timestamp = {2008.03.05},
}

% Accented letters are written as BibTeX special characters ({\"u}, {\c{c}},
% ...) so that 8-bit BibTeX abbreviates and sorts the names correctly.
@ARTICLE{Prufer2012_bonobopaper,
  author = {Kay Pr{\"u}fer and Kasper Munch and Ines
    Hellmann and Keiko Akagi and Jason R. Miller and Brian Walenz and Sergey Koren
    and Granger Sutton and Chinnappa Kodira and Roger Winer and James R. Knight and
    James C. Mullikin and Stephen J. Meader and Chris P. Ponting and Gerton Lunter
    and Saneyuki Higashino and Asger Hobolth and Julien Dutheil and Emre Karako{\c{c}}
    and Can Alkan and Saba Sajjadian and Claudia Rita Catacchio and Mario Ventura
    and Tomas Marques-Bonet and Evan E. Eichler and Claudine Andr{\'e} and Rebeca
    Atencia and Lawrence Mugisha and J{\"o}rg Junhold and Nick Patterson and others},
  title = {The bonobo genome compared with the chimpanzee and human genomes},
  journal = {Nature},
  year = {2012},
  volume = {486},
  pages = {527--531},
}

@ARTICLE{Quail2008,
  author = {Michael A Quail and Iwanka Kozarewa and Frances Smith and Aylwyn
    Scally and Philip J Stephens and Richard Durbin and Harold Swerdlow
    and Daniel J Turner},
  title = {A large genome center's improvements to the {Illumina} sequencing system},
  journal = {Nature Methods},
  year = {2008},
  volume = {5},
  pages = {1005--1010},
  month = dec,
  abstract = {The Wellcome Trust Sanger Institute is one of the world's largest
    genome centers, and a substantial amount of our sequencing is performed
    with 'next-generation' massively parallel sequencing technologies:
    in June 2008 the quantity of purity-filtered sequence data generated
    by our Genome Analyzer (Illumina) platforms reached 1 terabase, and
    our average weekly Illumina production output is currently 64 gigabases.
    Here we describe a set of improvements we have made to the standard
    Illumina protocols to make the library preparation more reliable
    in a high-throughput environment, to reduce bias, tighten insert
    size distribution and reliably obtain high yields of data.},
  institution = {Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Hinxton,
    Cambridgeshire, CB10 1SA, UK.},
  keywords = {Academies and Institutes; Chromosome Mapping; Equipment Design; Genomics;
    Polymerase Chain Reaction; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {nmeth.1270},
  pmid = {19034268},
  timestamp = {2009.04.12},
}

@ARTICLE{Redon2006,
  author = {Richard Redon and Shumpei Ishikawa and Karen R Fitch and Lars Feuk
    and George H Perry and T. Daniel Andrews and Heike Fiegler and Michael
    H Shapero and Andrew R Carson and Wenwei Chen and Eun Kyung Cho and
    Stephanie Dallaire and Jennifer L Freeman and Juan R Gonz{\'a}lez and
    M{\`o}nica Gratac{\`o}s and Jing Huang and Dimitrios Kalaitzopoulos and Daisuke
    Komura and Jeffrey R MacDonald and Christian R Marshall and Rui Mei
    and Lyndal Montgomery and Kunihiro Nishimura and Kohji Okamura and
    Fan Shen and Martin J Somerville and Joelle Tchinda and Armand Valsesia
    and Cara Woodwark and Fengtang Yang and Junjun Zhang and Tatiana
    Zerjal and Jane Zhang and Lluis Armengol and Donald F Conrad and
    Xavier Estivill and Chris Tyler-Smith and Nigel P Carter and Hiroyuki
    Aburatani and Charles Lee and Keith W Jones and Stephen W Scherer
    and Matthew E Hurles},
  title = {Global variation in copy number in the human genome},
  journal = {Nature},
  year = {2006},
  volume = {444},
  pages = {444--454},
  month = nov,
  abstract = {Copy number variation (CNV) of DNA sequences is functionally significant
    but has yet to be fully ascertained. We have constructed a first-generation
    CNV map of the human genome through the study of 270 individuals
    from four populations with ancestry in Europe, Africa or Asia (the
    {HapMap} collection). DNA from these individuals was screened for
    CNV using two complementary technologies: single-nucleotide polymorphism
    (SNP) genotyping arrays, and clone-based comparative genomic hybridization.
    A total of 1,447 copy number variable regions (CNVRs), which can
    encompass overlapping or adjacent gains or losses, covering 360 megabases
    (12\% of the genome) were identified in these populations. These
    CNVRs contained hundreds of genes, disease loci, functional elements
    and segmental duplications. Notably, the CNVRs encompassed more nucleotide
    content per genome than SNPs, underscoring the importance of CNV
    in genetic diversity and evolution. The data obtained delineate linkage
    disequilibrium patterns for many CNVs, and reveal marked variation
    in copy number among populations. We also demonstrate the utility
    of this resource for genetic disease studies.},
  keywords = {Chromosome Mapping; Gene Dosage; Genetics, Population; Genome, Human;
    Genomics; Genotype; Humans; Linkage Disequilibrium; Molecular Diagnostic
    Techniques; Oligonucleotide Array Sequence Analysis; Polymorphism,
    Single Nucleotide; Variation (Genetics)},
  owner = {calkan},
  pii = {nature05329},
  pmid = {17122850},
  timestamp = {2007.05.11},
}

@ARTICLE{Reich2010,
  author = {David Reich and Richard E Green and Martin Kircher and Johannes Krause
    and Nick Patterson and Eric Y Durand and Bence Viola and Adrian W
    Briggs and Udo Stenzel and Philip L F Johnson and Tomislav Maricic
    and Jeffrey M Good and Tomas Marques-Bonet and Can Alkan and Qiaomei
    Fu and Swapan Mallick and Heng Li and Matthias Meyer and Evan E Eichler
    and Mark Stoneking and Michael Richards and Sahra Talamo and Michael
    V Shunkov and Anatoli P Derevianko and Jean-Jacques Hublin and Janet
    Kelso and Montgomery Slatkin and Svante P{\"a}{\"a}bo},
  title = {Genetic history of an archaic hominin group from {Denisova Cave}
    in {Siberia}},
  journal = {Nature},
  year = {2010},
  volume = {468},
  pages = {1053--1060},
  month = dec,
  abstract = {Using DNA extracted from a finger bone found in Denisova Cave in southern
    Siberia, we have sequenced the genome of an archaic hominin to about
    1.9-fold coverage. This individual is from a group that shares a
    common origin with Neanderthals. This population was not involved
    in the putative gene flow from Neanderthals into Eurasians; however,
    the data suggest that it contributed 4-6\% of its genetic material
    to the genomes of present-day Melanesians. We designate this hominin
    population 'Denisovans' and suggest that it may have been widespread
    in Asia during the Late Pleistocene epoch. A tooth found in Denisova
    Cave carries a mitochondrial genome highly similar to that of the
    finger bone. This tooth shares no derived morphological features
    with Neanderthals or modern humans, further indicating that Denisovans
    have an evolutionary history distinct from Neanderthals and modern
    humans.},
  file = {main:Reich2010.pdf:PDF;supp_info:Reich2010-suppinfo.pdf:PDF;supp_table:Reich2010-supptable.xls:Excel},
  institution = {Department of Genetics, Harvard Medical School, Boston, Massachusetts
    02115, USA. reich@genetics.med.harvard.edu},
  keywords = {Animals; Asia; DNA, Mitochondrial, genetics; Europe; Finger Phalanges,
    chemistry; Fossils; Gene Flow; Genome, genetics; Hominidae, classification/genetics;
    Humans; Melanesia; Molecular Sequence Data; Phylogeny; Siberia; Tooth,
    anatomy /&/ histology/chemistry},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature09710},
  pmid = {21179161},
  timestamp = {2011.06.17},
}

% The consortium is a corporate author: it is wrapped in its own brace group
% so BibTeX keeps it as one indivisible name instead of splitting it into
% initials plus a surname.
@ARTICLE{Renton2011,
  author = {Alan E Renton and Elisa Majounie and Adrian Waite and Javier Sim{\'o}n-S{\'a}nchez
    and Sara Rollinson and J. Raphael Gibbs and Jennifer C Schymick and
    Hannu Laaksovirta and John C van Swieten and Liisa Myllykangas and
    Hannu Kalimo and Anders Paetau and Yevgeniya Abramzon and Anne M
    Remes and Alice Kaganovich and Sonja W Scholz and Jamie Duckworth
    and Jinhui Ding and Daniel W Harmer and Dena G Hernandez and Janel
    O Johnson and Kin Mok and Mina Ryten and Danyah Trabzuni and Rita
    J Guerreiro and Richard W Orrell and James Neal and Alex Murray and
    Justin Pearson and Iris E Jansen and David Sondervan and Harro Seelaar
    and Derek Blake and Kate Young and Nicola Halliwell and Janis Bennion
    Callister and Greg Toulson and Anna Richardson and Alex Gerhard and
    Julie Snowden and David Mann and David Neary and Michael A Nalls
    and Terhi Peuralinna and Lilja Jansson and Veli-Matti Isoviita and
    Anna-Lotta Kaivorinne and Maarit H{\"o}ltt{\"a}-Vuori and Elina Ikonen and
    Raimo Sulkava and Michael Benatar and Joanne Wuu and Adriano Chi{\`o}
    and Gabriella Restagno and Giuseppe Borghero and Mario Sabatelli
    and {The ITALSGEN Consortium} and David Heckerman and
    Ekaterina Rogaeva and Lorne Zinman and Jeffrey D Rothstein and Michael
    Sendtner and Carsten Drepper and Evan E Eichler and Can Alkan and
    Ziedulla Abdullaev and Svetlana D Pack and Amalia Dutra and Evgenia
    Pak and John Hardy and Andrew Singleton and Nigel M Williams and
    Peter Heutink and Stuart Pickering-Brown and Huw R Morris and Pentti
    J Tienari and Bryan J Traynor},
  title = {A Hexanucleotide Repeat Expansion in {C9ORF72} Is the Cause of Chromosome
    9p21-Linked {ALS-FTD}},
  journal = {Neuron},
  year = {2011},
  volume = {72},
  pages = {257--268},
  month = oct,
  abstract = {The chromosome 9p21 amyotrophic lateral sclerosis-frontotemporal dementia
    (ALS-FTD) locus contains one of the last major unidentified autosomal-dominant
    genes underlying these common neurodegenerative diseases. We have
    previously shown that a founder haplotype, covering the MOBKL2b,
    IFNK, and C9ORF72 genes, is present in the majority of cases linked
    to this region. Here we show that there is a large hexanucleotide
    (GGGGCC) repeat expansion in the first intron of C9ORF72 on the affected
    haplotype. This repeat expansion segregates perfectly with disease
    in the Finnish population, underlying 46.0\% of familial ALS and
    21.1\% of sporadic ALS in that population. Taken together with the
    D90A SOD1 mutation, 87\% of familial ALS in Finland is now explained
    by a simple monogenic cause. The repeat expansion is also present
    in one-third of familial ALS cases of outbred European descent, making
    it the most common genetic cause of these fatal neurodegenerative
    diseases identified to date.},
  file = {main:Renton2011.pdf:PDF;supp:Renton2011sup.pdf:PDF},
  institution = {Neuromuscular Diseases Research Unit, Laboratory of Neurogenetics,
    National Institute on Aging, National Institutes of Health, Bethesda,
    MD 20892, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0896-6273(11)00797-5},
  pmid = {21944779},
  timestamp = {2011.11.08},
}

@ARTICLE{Rozen2003,
  author = {Steve Rozen and Helen Skaletsky and Janet D Marszalek and Patrick
    J Minx and Holland S Cordum and Robert H Waterston and Richard K
    Wilson and David C Page},
  title = {Abundant gene conversion between arms of palindromes in human and
    ape {Y} chromosomes},
  journal = {Nature},
  year = {2003},
  volume = {423},
  pages = {873--876},
  month = jun,
  abstract = {Eight palindromes comprise one-quarter of the euchromatic DNA of the
    male-specific region of the human Y chromosome, the MSY. They contain
    many testis-specific genes and typically exhibit 99.97\% intra-palindromic
    (arm-to-arm) sequence identity. This high degree of identity could
    be interpreted as evidence that the palindromes arose through duplication
    events that occurred about 100,000 years ago. Using comparative sequencing
    in great apes, we demonstrate here that at least six of these MSY
    palindromes predate the divergence of the human and chimpanzee lineages,
    which occurred about 5 million years ago. The arms of these palindromes
    must have subsequently engaged in gene conversion, driving the paired
    arms to evolve in concert. Indeed, analysis of MSY palindrome sequence
    variation in existing human populations provides evidence of recurrent
    arm-to-arm gene conversion in our species. We conclude that during
    recent evolution, an average of approximately 600 nucleotides per
    newborn male have undergone Y-Y gene conversion, which has had an
    important role in the evolution of multi-copy testis gene families
    in the MSY.},
  keywords = {Animals; Base Sequence; Chromosomes, Human, Y; Chromosomes, Mammalian;
    Euchromatin; Evolution, Molecular; Gene Amplification; Gene Conversion;
    Gene Duplication; Gorilla gorilla; Hominidae; Humans; Male; Molecular
    Sequence Data; Multigene Family; Mutagenesis; Organ Specificity;
    Pan paniscus; Pan troglodytes; Sex Characteristics; Sex Determination
    (Genetics); Testis; Y Chromosome},
  owner = {calkan},
  pii = {nature01723},
  pmid = {12815433},
  timestamp = {2008.10.05},
}

% Survey comparing short-read aligners. The author list, volume, number and
% pages were filled in from the published record; please verify them against
% the journal page before relying on them.
@ARTICLE{tools,
  author = {Ruffalo, Matthew and LaFramboise, Thomas and Koyut{\"u}rk, Mehmet},
  title = {Comparative analysis of algorithms for next-generation sequencing
    read alignment},
  journal = {Bioinformatics},
  year = {2011},
  volume = {27},
  number = {20},
  pages = {2790--2796},
}

% SHRiMP short-read mapper. The tool name is brace-protected so that
% sentence-casing styles do not turn it into "Shrimp".
@ARTICLE{shrimp,
  author = {Stephen M. Rumble and Phil Lacroute and Adrian V. Dalca and Marc
    Fiume and Arend Sidow and Michael Brudno},
  title = {{SHRiMP}: Accurate Mapping of Short Color-space Reads},
  journal = {PLoS Computational Biology},
  year = {2009},
  month = may,
  volume = {5},
  pages = {e1000386},
}

@ARTICLE{Salari2009,
  author = {Raheleh Salari and Cagri Aksay and Emre Karakoc and Peter J Unrau
    and Iman Hajirasouliha and S. Cenk Sahinalp},
  title = {{smyRNA}: a novel ab initio {ncRNA} gene finder},
  journal = {PLoS ONE},
  year = {2009},
  volume = {4},
  pages = {e5433},
  abstract = {BACKGROUND: Non-coding RNAs (ncRNAs) have important functional roles
    in the cell: for example, they regulate gene expression by means
    of establishing stable joint structures with target mRNAs via complementary
    sequence motifs. Sequence motifs are also important determinants
    of the structure of ncRNAs. Although ncRNAs are abundant, discovering
    novel ncRNAs on genome sequences has proven to be a hard task; in
    particular past attempts for ab initio ncRNA search mostly failed
    with the exception of tools that can identify micro RNAs. METHODOLOGY/PRINCIPAL
    FINDINGS: We present a very general ab initio ncRNA gene finder that
    exploits differential distributions of sequence motifs between ncRNAs
    and background genome sequences. CONCLUSIONS/SIGNIFICANCE: Our method,
    once trained on a set of ncRNAs from a given species, can be applied
    to a genome sequences of other organisms to find not only ncRNAs
    homologous to those in the training set but also others that potentially
    belong to novel (and perhaps unknown) ncRNA families. AVAILABILITY:
    (http://compbio.cs.sfu.ca/taverna/smyrna).},
  institution = {School of Computing Science, Simon Fraser University, Burnaby, British
    Columbia, Canada.},
  keywords = {Algorithms; Artificial Intelligence; Computational Biology, methods;
    Conserved Sequence; RNA, Untranslated, genetics; Sequence Homology,
    Nucleic Acid},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {19415115},
  timestamp = {2010.09.15},
}

@ARTICLE{Scally2012,
  author = {Aylwyn Scally and Julien Y Dutheil and LaDeana W Hillier and Gregory
    E Jordan and Ian Goodhead and Javier Herrero and Asger Hobolth and
    Tuuli Lappalainen and Thomas Mailund and Tomas Marques-Bonet and
    Shane McCarthy and Stephen H Montgomery and Petra C Schwalie and
    Y. Amy Tang and Michelle C Ward and Yali Xue and Bryndis Yngvadottir
    and Can Alkan and Lars N Andersen and Qasim Ayub and Edward V Ball
    and Kathryn Beal and Brenda J Bradley and Yuan Chen and Chris M Clee
    and Stephen Fitzgerald and Tina A Graves and Yong Gu and Paul Heath
    and Andreas Heger and others},
  title = {Insights into hominid evolution from the gorilla genome sequence},
  journal = {Nature},
  year = {2012},
  volume = {483},
  pages = {169--175},
  month = mar,
  abstract = {Gorillas are humans' closest living relatives after chimpanzees, and
    are of comparable importance for the study of human origins and evolution.
    Here we present the assembly and analysis of a genome sequence for
    the western lowland gorilla, and compare the whole genomes of all
    extant great ape genera. We propose a synthesis of genetic and fossil
    evidence consistent with placing the human-chimpanzee and human-chimpanzee-gorilla
    speciation events at approximately 6 and 10 million years ago. In
    30\% of the genome, gorilla is closer to human or chimpanzee than
    the latter are to each other; this is rarer around coding genes,
    indicating pervasive selection throughout great ape evolution, and
    has functional consequences in gene expression. A comparison of protein
    coding genes reveals approximately 500 genes showing accelerated
    evolution on each of the gorilla, human and chimpanzee lineages,
    and evidence for parallel acceleration, particularly of genes involved
    in hearing. We also compare the western and eastern gorilla species,
    estimating an average sequence divergence time 1.75 million years
    ago, but with evidence for more recent genetic exchange and a population
    bottleneck in the eastern species. The use of the genome sequence
    in these and future analyses will promote a deeper understanding
    of great ape biology and evolution.},
  file = {Published version:Scally2012.pdf:PDF},
  institution = {Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Hinxton
    CB10 1SA, UK.},
  language = {eng},
  medline-pst = {epublish},
  owner = {calkan},
  pii = {nature10842},
  pmid = {22398555},
  timestamp = {2012.03.14},
}

@ARTICLE{Scharpf2007,
  author = {Robert B Scharpf and Jason C Ting and Jonathan Pevsner and Ingo Ruczinski},
  title = {{SNPchip}: {R} classes and methods for {SNP} array data},
  journal = {Bioinformatics},
  year = {2007},
  volume = {23},
  pages = {627--628},
  month = mar,
  abstract = {High-density single nucleotide polymorphism microarrays (SNP chips)
    provide information on a subject's genome, such as copy number and
    genotype (heterozygosity/homozygosity) at a SNP. While fluorescence
    in situ hybridization and karyotyping reveal many abnormalities,
    SNP chips provide a higher resolution map of the human genome that
    can be used to detect, e.g., aneuploidies, microdeletions, microduplications
    and loss of heterozygosity (LOH). As a variety of diseases are linked
    to such chromosomal abnormalities, SNP chips promise new insights
    for these diseases by aiding in the discovery of such regions, and
    may suggest targets for intervention. The R package SNPchip contains
    classes and methods useful for storing, visualizing and analyzing
    high density SNP data. Originally developed from the SNPscan web-tool,
    SNPchip utilizes S4 classes and extends other open source R tools
    available at Bioconductor. This has numerous advantages, including
    the ability to build statistical models for SNP-level data that operate
    on instances of the class, and to communicate with other R packages
    that add additional functionality. AVAILABILITY: The package is available
    from the Bioconductor web page at www.bioconductor.org. SUPPLEMENTARY
    INFORMATION: The supplementary material as described in this article
    (case studies, installation guidelines and R code) is available from
    http://biostat.jhsph.edu/~iruczins/publications/sm/},
  institution = {Department of Biostatistics, Johns Hopkins Bloomberg School of Public
    Health, Baltimore, MD 21205, USA.},
  keywords = {Models, Statistical; Oligonucleotide Array Sequence Analysis; Polymorphism,
    Single Nucleotide; Software},
  owner = {calkan},
  pii = {btl638},
  pmid = {17204461},
  timestamp = {2009.10.25},
}

@ARTICLE{Schueler2005,
  author = {Mary G Schueler and John M Dunn and Christine P Bird and Mark T Ross
	and Luigi Viggiano and {NISC Comparative Sequencing Program}
	and Mariano Rocchi and Huntington F Willard and Eric D Green},
  title = {Progressive proximal expansion of the primate {X} chromosome centromere},
  journal = {Proc Natl Acad Sci U S A},
  year = {2005},
  volume = {102},
  pages = {10563--10568},
  month = jul,
  abstract = {Previous studies of the pericentromeric region of the human {X} chromosome
	short arm (Xp) revealed an age gradient from ancient {DNA} that contains
	expressed genes to recent human-specific {DNA} at the functional
	centromere. We analyzed the finished sequence of this human genomic
	region to investigate its evolutionary history. Phylogenetic analysis
	of >1,500 alpha-satellite monomers from the region revealed the presence
	of five physical domains, each containing monomers from a distinct
	phylogenetic clade. The most distal domain contains long interspersed
	nucleotide element repeats that were active >35 million years ago,
	whereas the four proximal domains contain more recently active long
	interspersed nucleotide element repeats. An out-of-register, unequal
	recombination (i.e., crossover) detected at the edge of the {X} chromosome-specific
	alpha-satellite array (DXZ1) may reflect the most recent of a series
	of punctuating events during evolution that resulted in a proximal
	physical expansion of the {X} centromere. The first 18 kb of this
	array has 97-99\% pairwise identity among all 2-kb repeat units.
	To perform more detailed evolutionary comparisons, we sequenced the
	junction between the ancient {DNA} of Xp and the primate-specific
	alpha satellite in chimpanzee, gorilla, orangutan, vervet, macaque,
	and baboon. The striking conservation found in all cases supports
	the ancestral nature of the alpha satellite at this location. These
	studies demonstrate that the primate {X} centromere appears to have
	evolved through repeated expansion events occurring within the central,
	active region of centromeric {DNA}, with the newly added sequences
	then conferring centromere function.},
  
  keywords = {Animals; Base Sequence; Centromere; Chromosomes, Human, X; Cluster
	Analysis; Conserved Sequence; {DNA} Repeat Expansion; Evolution,
	Molecular; Humans; Interspersed Repetitive Sequences; Molecular Sequence
	Data; Phylogeny; Primates; Sequence Analysis, {DNA}},
  owner = {calkan},
  pii = {0503346102},
  pmid = {16030148},
  timestamp = {2007.04.11},
}

@ARTICLE{Schuster2010,
  author = {Stephan C Schuster and Webb Miller and Aakrosh Ratan and Lynn P Tomsho
    and Belinda Giardine and Lindsay R Kasson and Robert S Harris and
    Desiree C Petersen and Fangqing Zhao and Ji Qi and Can Alkan and
    Jeffrey M Kidd and Yazhou Sun and Daniela I Drautz and Pascal Bouffard
    and Donna M Muzny and Jeffrey G Reid and Lynne V Nazareth and Qingyu
    Wang and Richard Burhans and Cathy Riemer and Nicola E Wittekindt
    and Priya Moorjani and Elizabeth A Tindall and Charles G Danko and
    Wee Siang Teo and Anne M Buboltz and Zhenhai Zhang and Qianyi Ma
    and Arno Oosthuysen and others},
  title = {Complete {Khoisan} and {Bantu} genomes from southern {Africa}},
  journal = {Nature},
  year = {2010},
  volume = {463},
  pages = {943--947},
  month = feb,
  abstract = {The genetic structure of the indigenous hunter-gatherer peoples of
	southern Africa, the oldest known lineage of modern human, is important
	for understanding human diversity. Studies based on mitochondrial
	and small sets of nuclear markers have shown that these hunter-gatherers,
	known as Khoisan, San, or Bushmen, are genetically divergent from
	other humans. However, until now, fully sequenced human genomes have
	been limited to recently diverged populations. Here we present the
	complete genome sequences of an indigenous hunter-gatherer from the
	Kalahari Desert and a Bantu from southern Africa, as well as protein-coding
	regions from an additional three hunter-gatherers from disparate
	regions of the Kalahari. We characterize the extent of whole-genome
	and exome diversity among the five men, reporting 1.3 million novel
	DNA differences genome-wide, including 13,146 novel amino acid variants.
	In terms of nucleotide substitutions, the Bushmen seem to be, on
	average, more different from each other than, for example, a European
	and an Asian. Observed genomic differences between the hunter-gatherers
	and others may help to pinpoint genetic adaptations to an agricultural
	lifestyle. Adding the described variants to current databases will
	facilitate inclusion of southern Africans in medical research efforts,
	particularly when family and medical histories can be correlated
	with genome-wide data.},
  
  file = {main:Schuster2010.pdf:PDF;supp:Schuster2010-supp.pdf:PDF},
  institution = {Pennsylvania State University, Center for Comparative Genomics and
	Bioinformatics, 310 Wartik Lab, University Park, Pennsylvania 16802,
	USA. scs@bx.psu.edu},
  keywords = {African Continental Ancestry Group, genetics; Asian Continental Ancestry
	Group, genetics; Ethnic Groups, genetics; European Continental Ancestry
	Group, genetics; Exons, genetics; Genetics, Medical; Genome, Human,
	genetics; Humans; Phylogeny; Polymorphism, Single Nucleotide, genetics;
	South Africa, ethnology},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature08795},
  pmid = {20164927},
  timestamp = {2010.09.15},
}

@article{Sharp2006,
  author    = {Andrew J Sharp and Ze Cheng and Evan E Eichler},
  title     = {Structural variation of the human genome},
  journal   = {Annu Rev Genomics Hum Genet},
  year      = {2006},
  volume    = {7},
  pages     = {407--442},
  abstract  = {There is growing appreciation that the human genome contains
               significant numbers of structural rearrangements, such as
               insertions, deletions, inversions, and large tandem repeats.
               Recent studies have defined approximately 5\% of the human
               genome as structurally variant in the normal population,
               involving more than 800 independent genes. We present a
               detailed review of the various structural rearrangements
               identified to date in humans, with particular reference to
               their influence on human phenotypic variation. Our current
               knowledge of the extent of human structural variation shows
               that the human genome is a highly dynamic structure that shows
               significant large-scale variation from the currently published
               genome reference sequence.},
  owner     = {calkan},
  pmid      = {16780417},
  timestamp = {2007.05.11},
}

@ARTICLE{Sharp2006a,
  author = {Andrew J Sharp and Sierra Hansen and Rebecca R Selzer and Ze Cheng
	and Regina Regan and Jane A Hurst and Helen Stewart and Sue M Price
	and Edward Blair and Raoul C Hennekam and Carrie A Fitzpatrick and
	Rick Segraves and Todd A Richmond and Cheryl Guiver and Donna G Albertson
	and Daniel Pinkel and Peggy S Eis and Stuart Schwartz and Samantha
	J L Knight and Evan E Eichler},
  title = {Discovery of previously unidentified genomic disorders from the duplication
	architecture of the human genome},
  journal = {Nat Genet},
  year = {2006},
  volume = {38},
  pages = {1038--1042},
  month = sep,
  abstract = {Genomic disorders are characterized by the presence of flanking segmental
	duplications that predispose these regions to recurrent rearrangement.
	Based on the duplication architecture of the genome, we investigated
	130 regions that we hypothesized as candidates for previously undescribed
	genomic disorders. We tested 290 individuals with mental retardation
	by BAC array comparative genomic hybridization and identified 16
	pathogenic rearrangements, including de novo microdeletions of 17q21.31
	found in four individuals. Using oligonucleotide arrays, we refined
	the breakpoints of this microdeletion, defining a 478-kb critical
	region containing six genes that were deleted in all four individuals.
	We mapped the breakpoints of this deletion and of four other pathogenic
	rearrangements in 1q21.1, 15q13, 15q24 and 17q12 to flanking segmental
	duplications, suggesting that these are also sites of recurrent rearrangement.
	In common with the 17q21.31 deletion, these breakpoint regions are
	sites of copy number polymorphism in controls, indicating that these
	may be inherently unstable genomic regions.},
  
  institution = {Department of Genome Sciences and The Howard Hughes Medical Institute,
	University of Washington School of Medicine, 1705 NE Pacific St.,
	Seattle, Washington 98195, USA.},
  keywords = {Chromosome Breakage; Chromosome Deletion; Chromosomes, Artificial,
	Bacterial; Chromosomes, Human, Pair 17; Gene Dosage; Gene Duplication;
	Gene Rearrangement; Genome, Human; Heterozygote; Humans; In Situ
	Hybridization, Fluorescence; Mental Retardation; Mosaicism; Nucleic
	Acid Hybridization; Oligonucleotide Array Sequence Analysis; Physical
	Chromosome Mapping; Polymorphism, Genetic},
  owner = {calkan},
  pii = {ng1862},
  pmid = {16906162},
  timestamp = {2009.10.26},
}

@ARTICLE{Sharp2007,
  author = {Andrew J Sharp and Andy Itsara and Ze Cheng and Can Alkan and Stuart
	Schwartz and Evan E Eichler},
  title = {Optimal design of oligonucleotide microarrays for measurement of
	{DNA} copy-number},
  journal = {Hum Mol Genet},
  year = {2007},
  volume = {16},
  pages = {2770--2779},
  month = nov,
  abstract = {Copy-number variants (CNVs) occur frequently within the human genome,
	and may be associated with many human phenotypes. If disease association
	studies of CNVs are to be performed routinely, it is essential that
	the copy-number status be accurately genotyped. We systematically
	assessed the dynamic range response of an oligonucleotide microarray
	platform to accurately predict copy-number in a set of seven patients
	who had previously been shown to carry between 1 and 6 copies of
	an approximately 4 Mb region of 15q12.2-q13.1. We identify probe
	uniqueness, probe length, uniformity of probe melting temperature,
	overlap with SNPs and common repeats (particularly Alu elements)
	and guanine homopolymer content as parameters that significantly
	affect probe performance. Further, we prove the influence of these
	criteria on array performance by using these parameters to prospectively
	filter data from a second array design covering an independent genomic
	region and observing significant improvements in data quality. The
	informed selection of probes which have superior performance characteristics
	allows the prospective design of oligonucleotide arrays which show
	increased sensitivity and specificity compared with current designs.
	Although based on the analysis of data from comparative genomic hybridization
	experiments, we anticipate that our results are relevant to the design
	of improved oligonucleotide arrays for high-throughput copy-number
	genotyping of complex regions of the human genome.},
  
  file = {main:Sharp2007.pdf:PDF;supp:Sharp2007-supp.pdf:PDF},
  keywords = {Chromosomes, Human, Pair 15; DNA; DNA Probes; Gene Dosage; Gene Expression
	Profiling; Genome, Human; Humans; Nucleic Acid Hybridization; Oligonucleotide
	Array Sequence Analysis; Polymorphism, Single Nucleotide; Research
	Design; Sensitivity and Specificity},
  owner = {calkan},
  pii = {ddm234},
  pmid = {17725982},
  timestamp = {2008.10.05},
}

@ARTICLE{Sharp2005,
  author = {Andrew J Sharp and Devin P Locke and Sean D McGrath and Ze Cheng
	and Jeffrey A Bailey and Rhea U Vallente and Lisa M Pertz and Royden
	A Clark and Stuart Schwartz and Rick Segraves and Vanessa V Oseroff
	and Donna G Albertson and Daniel Pinkel and Evan E Eichler},
  title = {Segmental duplications and copy-number variation in the human genome},
  journal = {Am J Hum Genet},
  year = {2005},
  volume = {77},
  pages = {78--88},
  month = jul,
  abstract = {The human genome contains numerous blocks of highly homologous duplicated
	sequence. This higher-order architecture provides a substrate for
	recombination and recurrent chromosomal rearrangement associated
	with genomic disease. However, an assessment of the role of segmental
	duplications in normal variation has not yet been made. On the basis
	of the duplication architecture of the human genome, we defined a
	set of 130 potential rearrangement hotspots and constructed a targeted
	bacterial artificial chromosome (BAC) microarray (with 2,194 BACs)
	to assess copy-number variation in these regions by array comparative
	genomic hybridization. Using our segmental duplication BAC microarray,
	we screened a panel of 47 normal individuals, who represented populations
	from four continents, and we identified 119 regions of copy-number
	polymorphism (CNP), 73 of which were previously unreported. We observed
	an equal frequency of duplications and deletions, as well as a 4-fold
	enrichment of CNPs within hotspot regions, compared with control
	BACs (P < .000001), which suggests that segmental duplications are
	a major catalyst of large-scale variation in the human genome. Importantly,
	segmental duplications themselves were also significantly enriched
	>4-fold within regions of CNP. Almost without exception, CNPs were
	not confined to a single population, suggesting that these either
	are recurrent events, having occurred independently in multiple founders,
	or were present in early human populations. Our study demonstrates
	that segmental duplications define hotspots of chromosomal rearrangement,
	likely acting as mediators of normal variation as well as genomic
	disease, and it suggests that the consideration of genomic architecture
	can significantly improve the ascertainment of large-scale rearrangements.
	Our specialized segmental duplication BAC microarray and associated
	database of structural polymorphisms will provide an important resource
	for the future characterization of human genomic disorders.},
  
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, WA 98195, USA.},
  keywords = {Chromosomes, Artificial, Bacterial; Gene Dosage; Genetic Variation;
	Genome, Human; Humans; Nucleic Acid Hybridization, methods; Oligonucleotide
	Array Sequence Analysis, methods; Polymorphism, Genetic; Recombination,
	Genetic; Repetitive Sequences, Nucleic Acid; Reproducibility of Results},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0002-9297(07)60903-3},
  pmid = {15918152},
  timestamp = {2011.07.19},
}

@ARTICLE{She2004b,
  author = {Xinwei She and Julie E Horvath and Zhaoshi Jiang and Ge Liu and Terrence
	S Furey and Laurie Christ and Royden Clark and Tina Graves and Cassy
	L Gulden and Can Alkan and Jeff A Bailey and Cenk Sahinalp and Mariano
	Rocchi and David Haussler and Richard K Wilson and Webb Miller and
	Stuart Schwartz and Evan E Eichler},
  title = {The structure and evolution of centromeric transition regions within
	the human genome},
  journal = {Nature},
  year = {2004},
  volume = {430},
  pages = {857--864},
  month = aug,
  abstract = {An understanding of how centromeric transition regions are organized
	is a critical aspect of chromosome structure and function; however,
	the sequence context of these regions has been difficult to resolve
	on the basis of the draft genome sequence. We present a detailed
	analysis of the structure and assembly of all human pericentromeric
	regions (5 megabases). Most chromosome arms (35 out of 43) show a
	gradient of dwindling transcriptional diversity accompanied by an
	increasing number of interchromosomal duplications in proximity to
	the centromere. At least 30\% of the centromeric transition region
	structure originates from euchromatic gene-containing segments of
	DNA that were duplicatively transposed towards pericentromeric regions
	at a rate of six-seven events per million years during primate evolution.
	This process has led to the formation of a minimum of 28 new transcripts
	by exon exaptation and exon shuffling, many of which are primarily
	expressed in the testis. The distribution of these duplicated segments
	is nonrandom among pericentromeric regions, suggesting that some
	regions have served as preferential acceptors of euchromatic DNA.},
  
  file = {main:She2004b.pdf:PDF},
  institution = {Department of Genetics, Center for Computational Genomics and the
	Center for Human Genetics, Case Western Reserve University School
	of Medicine and University Hospitals of Cleveland, Cleveland, Ohio
	44106, USA.},
  keywords = {Animals; Base Composition; Centromere, chemistry/genetics; DNA, chemistry/genetics;
	Euchromatin, chemistry/genetics; Evolution, Molecular; Expressed
	Sequence Tags; Gene Duplication; Genome, Human; Humans; RNA, Messenger,
	analysis/genetics; Transcription, Genetic, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {nature02806},
  pmid = {15318213},
  timestamp = {2011.06.17},
}

% Same paper as Shendure2008; this key is kept so existing \cite commands still resolve.
@ARTICLE{genome_sequence_0,
  author = {Jay Shendure and Hanlee Ji},
  title = {Next-generation {DNA} sequencing},
  journal = {Nature Biotechnology},
  year = {2008},
  volume = {26},
  pages = {1135--1145},
  month = oct,
}

@ARTICLE{Shendure2008,
  author = {Jay Shendure and Hanlee Ji},
  title = {Next-generation {DNA} sequencing},
  journal = {Nat Biotechnol},
  year = {2008},
  volume = {26},
  pages = {1135--1145},
  month = oct,
  abstract = {DNA sequence represents a single format onto which a broad range of
	biological phenomena can be projected for high-throughput data collection.
	Over the past three years, massively parallel DNA sequencing platforms
	have become widely available, reducing the cost of DNA sequencing
	by over two orders of magnitude, and democratizing the field by putting
	the sequencing capacity of a major genome center in the hands of
	individual investigators. These new technologies are rapidly evolving,
	and near-term challenges include the development of robust protocols
	for generating sequencing libraries, building effective new approaches
	to data-analysis, and often a rethinking of experimental design.
	Next-generation DNA sequencing has the potential to dramatically
	accelerate biological and biomedical research, by enabling the comprehensive
	analysis of genomes, transcriptomes and interactomes to become inexpensive,
	routine and widespread, rather than requiring significant production-scale
	efforts.},
  
  institution = {Department of Genome Sciences, University of Washington, Seattle,
	Washington 98195-5065, USA. shendure@u.washington.edu},
  keywords = {Chromosome Mapping; Forecasting; Genomics; Sequence Alignment; Sequence
	Analysis, DNA},
  owner = {calkan},
  pii = {nbt1486},
  pmid = {18846087},
  timestamp = {2009.04.25},
}

@ARTICLE{Shendure2004,
  author = {Jay Shendure and Robi D Mitra and Chris Varma and George M Church},
  title = {Advanced sequencing technologies: methods and goals},
  journal = {Nat Rev Genet},
  year = {2004},
  volume = {5},
  pages = {335--344},
  month = may,
  
  keywords = {{DNA}; Genome, Human; Humans; Sequence Analysis, {DNA}},
  owner = {calkan},
  pii = {nrg1325},
  pmid = {15143316},
  timestamp = {2007.05.08},
}

@ARTICLE{Simpson2009,
  author = {Jared T Simpson and Kim Wong and Shaun D Jackman and Jacqueline E
	Schein and Steven J M Jones and Inan{\c{c}} Birol},
  title = {{ABySS}: a parallel assembler for short read sequence data},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1117--1123},
  month = jun,
  abstract = {Widespread adoption of massively parallel deoxyribonucleic acid (DNA)
	sequencing instruments has prompted the recent development of de
	novo short read assembly algorithms. A common shortcoming of the
	available tools is their inability to efficiently assemble vast amounts
	of data generated from large-scale sequencing projects, such as the
	sequencing of individual human genomes to catalog natural genetic
	variation. To address this limitation, we developed ABySS (Assembly
	By Short Sequences), a parallelized sequence assembler. As a demonstration
	of the capability of our software, we assembled 3.5 billion paired-end
	reads from the genome of an African male publicly released by Illumina,
	Inc. Approximately 2.76 million contigs > or =100 base pairs (bp)
	in length were created with an N50 size of 1499 bp, representing
	68\% of the reference human genome. Analysis of these contigs identified
	polymorphic and novel sequences not present in the human reference
	assembly, which were validated by alignment to alternate human assemblies
	and to other primate genomes.},
  
  file = {main:Simpson2009.pdf:PDF},
  institution = {Genome Sciences Centre, British Columbia Cancer Agency, Vancouver,
	British Columbia V5Z 4E6, Canada.},
  keywords = {Algorithms; Animals; Computational Biology, methods; Contig Mapping;
	Escherichia coli K12, genetics; Genetic Variation; Genome, Human;
	Humans; Polymorphism, Genetic; Reproducibility of Results; Sequence
	Analysis, DNA, methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.089532.108},
  pmid = {19251739},
  timestamp = {2010.09.15},
}

@ARTICLE{Skaletsky2003,
  author = {Helen Skaletsky and Tomoko Kuroda-Kawaguchi and Patrick J Minx and
	Holland S Cordum and LaDeana Hillier and Laura G Brown and Sjoerd
	Repping and Tatyana Pyntikova and Johar Ali and Tamberlyn Bieri and
	Asif Chinwalla and Andrew Delehaunty and Kim Delehaunty and Hui Du
	and Ginger Fewell and Lucinda Fulton and Robert Fulton and Tina Graves
	and Shun-Fang Hou and Philip Latrielle and Shawn Leonard and Elaine
	Mardis and Rachel Maupin and John McPherson and Tracie Miner and
	William Nash and Christine Nguyen and Philip Ozersky and Kymberlie
	Pepin and Susan Rock and Tracy Rohlfing and Kelsi Scott and Brian
	Schultz and Cindy Strong and Aye Tin-Wollam and Shiaw-Pyng Yang and
	Robert H Waterston and Richard K Wilson and Steve Rozen and David
	C Page},
  title = {The male-specific region of the human {Y} chromosome is a mosaic
	of discrete sequence classes},
  journal = {Nature},
  year = {2003},
  volume = {423},
  pages = {825--837},
  month = jun,
  abstract = {The male-specific region of the Y chromosome, the MSY, differentiates
	the sexes and comprises 95\% of the chromosome's length. Here, we
	report that the MSY is a mosaic of heterochromatic sequences and
	three classes of euchromatic sequences: X-transposed, X-degenerate
	and ampliconic. These classes contain all 156 known transcription
	units, which include 78 protein-coding genes that collectively encode
	27 distinct proteins. The X-transposed sequences exhibit 99\% identity
	to the X chromosome. The X-degenerate sequences are remnants of ancient
	autosomes from which the modern X and Y chromosomes evolved. The
	ampliconic class includes large regions (about 30\% of the MSY euchromatin)
	where sequence pairs show greater than 99.9\% identity, which is
	maintained by frequent gene conversion (non-reciprocal transfer).
	The most prominent features here are eight massive palindromes, at
	least six of which contain testis genes.},
  
  keywords = {Chromosomes, Human, X; Chromosomes, Human, Y; Crossing Over, Genetic;
	DNA Transposable Elements; Euchromatin; Evolution, Molecular; Female;
	Gene Amplification; Gene Conversion; Genes; Heterochromatin; Humans;
	In Situ Hybridization, Fluorescence; Male; Models, Genetic; Multigene
	Family; Organ Specificity; Pseudogenes; Sequence Homology, Nucleic
	Acid; Sex Characteristics; Sex Determination (Genetics); Species
	Specificity; Testis; Transcription, Genetic; Transducin},
  owner = {calkan},
  pii = {nature01722},
  pmid = {12815422},
  timestamp = {2008.10.05},
}

@ARTICLE{Smith2008,
  author = {Douglas R Smith and Aaron R Quinlan and Heather E Peckham and Kathryn
	Makowsky and Wei Tao and Betty Woolf and Lei Shen and William F Donahue
	and Nadeem Tusneem and Michael P Stromberg and Donald A Stewart and
	Lu Zhang and Swati S Ranade and Jason B Warner and Clarence C Lee
	and Brittney E Coleman and Zheng Zhang and Stephen F McLaughlin and
	Joel A Malek and Jon M Sorenson and Alan P Blanchard and Jarrod Chapman
	and David Hillman and Feng Chen and Daniel S Rokhsar and Kevin J
	McKernan and Thomas W Jeffries and Gabor T Marth and Paul M Richardson},
  title = {Rapid whole-genome mutational profiling using next-generation sequencing
	technologies},
  journal = {Genome Res},
  year = {2008},
  volume = {18},
  pages = {1638--1642},
  month = oct,
  abstract = {Forward genetic mutational studies, adaptive evolution, and phenotypic
	screening are powerful tools for creating new variant organisms with
	desirable traits. However, mutations generated in the process cannot
	be easily identified with traditional genetic tools. We show that
	new high-throughput, massively parallel sequencing technologies can
	completely and accurately characterize a mutant genome relative to
	a previously sequenced parental (reference) strain. We studied a
	mutant strain of Pichia stipitis, a yeast capable of converting xylose
	to ethanol. This unusually efficient mutant strain was developed
	through repeated rounds of chemical mutagenesis, strain selection,
	transformation, and genetic manipulation over a period of seven years.
	We resequenced this strain on three different sequencing platforms.
	Surprisingly, we found fewer than a dozen mutations in open reading
	frames. All three sequencing technologies were able to identify each
	single nucleotide mutation given at least 10-15-fold nominal sequence
	coverage. Our results show that detecting mutations in evolved and
	engineered organisms is rapid and cost-effective at the whole-genome
	level using new sequencing technologies. Identification of specific
	mutations in strains with altered phenotypes will add insight into
	specific gene functions and guide further metabolic engineering efforts.},
  
  file = {main:Smith2008.pdf:PDF;supp:Smith2008-supp.doc:Word},
  institution = {Agencourt Bioscience Corporation, Beverly, Massachusetts 01915, USA.
	douglas.smith@agencourt.com},
  keywords = {DNA Mutational Analysis; Genome, Fungal; Mutation; Pichia; Sequence
	Alignment; Sequence Analysis, DNA},
  owner = {calkan},
  pii = {gr.077776.108},
  pmid = {18775913},
  timestamp = {2009.01.20},
}

@ARTICLE{sw,
  author = {Temple F. Smith and Michael S. Waterman},
  title = {Identification of Common Molecular Subsequences},
  journal = {Journal of Molecular Biology},
  year = {1981},
  volume = {147},
  number = {1},
  pages = {195--197},
  month = mar,
}

@ARTICLE{Sudmant2010,
  author = {Peter H Sudmant and Jacob O Kitzman and Francesca Antonacci and Can
	Alkan and Maika Malig and Anya Tsalenko and Nick Sampas and Laurakay
	Bruhn and Jay Shendure and {1000 Genomes Project} and Evan E Eichler},
  title = {Diversity of human copy number variation and multicopy genes},
  journal = {Science},
  year = {2010},
  volume = {330},
  pages = {641--646},
  month = oct,
  abstract = {Copy number variants affect both disease and normal phenotypic variation,
	but those lying within heavily duplicated, highly identical sequence
	have been difficult to assay. By analyzing short-read mapping depth
	for 159 human genomes, we demonstrated accurate estimation of absolute
	copy number for duplications as small as 1.9 kilobase pairs, ranging
	from 0 to 48 copies. We identified 4.1 million "singly unique nucleotide"
	positions informative in distinguishing specific copies and used
	them to genotype the copy and content of specific paralogs within
	highly duplicated gene families. These data identify human-specific
	expansions in genes associated with brain development, reveal extensive
	population genetic diversity, and detect signatures consistent with
	gene conversion in the human species. Our approach makes ~1000 genes
	accessible to genetic studies of disease association.},
  
  file = {main:Sudmant2010.pdf:PDF;supp:Sudmant2010-supp.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, WA 98195, USA.},
  keywords = {Chromosome Mapping; Continental Population Groups, genetics; DNA Copy
	Number Variations; Databases, Nucleic Acid; Evolution, Molecular;
	Female; Gene Conversion; Gene Dosage; Gene Duplication; Gene Frequency;
	Genes, Duplicate; Genetic Variation; Genome, Human; Genomics, methods;
	Genotype; Haplotypes; Humans; Male; Polymorphism, Single Nucleotide;
	Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {330/6004/641},
  pmid = {21030649},
  timestamp = {2011.06.17},
}

@ARTICLE{Suk2011,
  author = {Eun-Kyung Suk and Gayle K McEwen and Jorge Duitama and Katja Nowick
	and Sabrina Schulz and Stefanie Palczewski and Stefan Schreiber and
	Dustin T Holloway and Stephen McLaughlin and Heather Peckham and
	Clarence Lee and Thomas Huebsch and Margret R Hoehe},
  title = {A comprehensively molecular haplotype-resolved genome of a {European}
	individual},
  journal = {Genome Res},
  year = {2011},
  volume = {21},
  pages = {1672--1685},
  month = oct,
  abstract = {Independent determination of both haplotype sequences of an individual
	genome is essential to relate genetic variation to genome function,
	phenotype, and disease. To address the importance of phase, we have
	generated the most complete haplotype-resolved genome to date, "Max
	Planck One" (MP1), by fosmid pool-based next generation sequencing.
	Virtually all SNPs (>99\%) and 80,000 indels were phased into haploid
	sequences of up to 6.3 Mb (N50 ~1 Mb). The completeness of phasing
	allowed determination of the concrete molecular haplotype pairs for
	the vast majority of genes (81\%) including potential regulatory
	sequences, of which >90\% were found to be constituted by two different
	molecular forms. A subset of 159 genes with potentially severe mutations
	in either cis or trans configurations exemplified in particular the
	role of phase for gene function, disease, and clinical interpretation
	of personal genomes (e.g., BRCA1). Extended genomic regions harboring
	manifold combinations of physically and/or functionally related genes
	and regulatory elements were resolved into their underlying "haploid
	landscapes," which may define the functional genome. Moreover, the
	majority of genes and functional sequences were found to contain
	individual or rare SNPs, which cannot be phased from population data
	alone, emphasizing the importance of molecular phasing for characterizing
	a genome in its molecular individuality. Our work provides the foundation
	to understand that the distinction of molecular haplotypes is essential
	to resolve the (inherently individual) biology of genes, genomes,
	and disease, establishing a reference point for "phase-sensitive"
	personal genomics. MP1's annotated haploid genomes are available
	as a public resource.},
  
  file = {main:Suk2011.pdf:PDF},
  institution = {Department of Vertebrate Genomics, Max Planck Institute for Molecular
	Genetics, 14195 Berlin, Germany.},
  keywords = {Female; Genome, Human; Genomics; Haplotypes; High-Throughput Nucleotide
	Sequencing; Humans; INDEL Mutation; Male; Middle Aged; Polymorphism,
	Single Nucleotide; Sequence Analysis, DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.125047.111},
  pmid = {21813624},
  timestamp = {2012.03.07},
}

@ARTICLE{Tuzun2004,
  author = {Eray Tuzun and Jeffrey A Bailey and Evan E Eichler},
  title = {Recent segmental duplications in the working draft assembly of the
	brown {Norway} rat},
  journal = {Genome Res},
  year = {2004},
  volume = {14},
  pages = {493--506},
  month = apr,
  abstract = {We assessed the content, structure, and distribution of segmental
	duplications (> or =90\% sequence identity, > or =5 kb length) within
	the published version of the Rattus norvegicus genome assembly (v.3.1).
	The overall fraction of duplicated sequence within the rat assembly
	(2.92\%) is greater than that of the mouse (1\%-1.2\%) but significantly
	less than that of human ( approximately 5\%). Duplications were nonuniformly
	distributed, occurring predominantly as tandem and tightly clustered
	intrachromosomal duplications. Regions containing extensive interchromosomal
	duplications were observed, particularly within subtelomeric and
	pericentromeric regions. We identified 41 discrete genomic regions
	greater than 1 Mb in size, termed "duplication blocks." These appear
	to have been the target of extensive duplication over millions of
	years of evolution. Gene content within duplicated regions ( approximately
	1\%) was lower than expected based on the genome representation.
	Interestingly, sequence contigs lacking chromosome assignment ("the
	unplaced chromosome") showed a marked enrichment for segmental duplication
	(45\% of 75.2 Mb), indicating that segmental duplications have been
	problematic for sequence and assembly of the rat genome. Further
	targeted efforts are required to resolve the organization and complexity
	of these regions.},
  institution = {Department of Genetics, Center for Computational Genomics, Case Western
	Reserve University School of Medicine and University Hospitals of
	Cleveland, Cleveland, Ohio 44106, USA.},
  keywords = {Animals; Base Composition, genetics; Chromosomes, genetics; Computational
	Biology, methods/statistics /&/ numerical data; Contig Mapping, methods/statistics
	/&/ numerical data; Gene Conversion, genetics; Gene Duplication;
	Genes, genetics; Genome; Rats; Rats, Inbred BN, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {14/4/493},
  pmid = {15059990},
  timestamp = {2011.07.19},
}

@ARTICLE{Tuzun2005,
  author = {Eray Tuzun and Andrew J Sharp and Jeffrey A Bailey and Rajinder Kaul
	and V. Anne Morrison and Lisa M Pertz and Eric Haugen and Hillary
	Hayden and Donna Albertson and Daniel Pinkel and Maynard V Olson
	and Evan E Eichler},
  title = {Fine-scale structural variation of the human genome},
  journal = {Nat Genet},
  year = {2005},
  volume = {37},
  pages = {727--732},
  month = jul,
  abstract = {Inversions, deletions and insertions are important mediators of disease
	and disease susceptibility. We systematically compared the human
	genome reference sequence with a second genome (represented by fosmid
	paired-end sequences) to detect intermediate-sized structural variants
	>8 kb in length. We identified 297 sites of structural variation:
	139 insertions, 102 deletions and 56 inversion breakpoints. Using
	combined literature, sequence and experimental analyses, we validated
	112 of the structural variants, including several that are of biomedical
	relevance. These data provide a fine-scale structural variation map
	of the human genome and the requisite sequence precision for subsequent
	genetic studies of human disease.},
  file = {main:Tuzun2005.pdf:PDF;supp_note:Tuzun2005-supp.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, 1705 NE Pacific Street, Seattle, Washington 98195, USA.
	eee@gs.washington.edu},
  keywords = {Base Pairing; Cell Line, Tumor; Computational Biology; Genome, Human;
	Genomic Instability; Humans; Mutation; Oligonucleotide Array Sequence
	Analysis; Polymorphism, Genetic; Reference Values; Sequence Analysis,
	DNA},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {ng1562},
  pmid = {15895083},
  timestamp = {2011.07.19},
}

@ARTICLE{ukkonen,
  author = {Esko Ukkonen},
  title = {Finding approximate patterns in strings},
  journal = {Journal of Algorithms},
  year = {1985},
  volume = {6},
  number = {1},
  pages = {132--137}
}

@ARTICLE{Ventura2007,
  author = {Mario Ventura and Francesca Antonacci and Maria Francesca Cardone
	and Roscoe Stanyon and Pietro D'Addabbo and Angelo Cellamare and
	L. James Sprague and Evan E Eichler and Nicoletta Archidiacono and
	Mariano Rocchi},
  title = {Evolutionary formation of new centromeres in macaque},
  journal = {Science},
  year = {2007},
  volume = {316},
  pages = {243--246},
  month = apr,
  abstract = {A systematic fluorescence in situ hybridization comparison of macaque
	and human synteny organization disclosed five additional macaque
	evolutionary new centromeres (ENCs) for a total of nine ENCs. To
	understand the dynamics of ENC formation and progression, we compared
	the ENC of macaque chromosome 4 with the human orthologous region,
	at 6q24.3, that conserves the ancestral genomic organization. A 250-kilobase
	segment was extensively duplicated around the macaque centromere.
	These duplications were strictly intrachromosomal. Our results suggest
	that novel centromeres may trigger only local duplication activity
	and that the absence of genes in the seeding region may have been
	important in ENC maintenance and progression.},
  owner = {calkan},
  pii = {316/5822/243},
  pmid = {17431171},
  timestamp = {2007.04.17},
}

@ARTICLE{Ventura2011,
  author = {Mario Ventura and Claudia R Catacchio and Can Alkan and Tomas Marques-Bonet
    and Saba Sajjadian and Tina A Graves and Fereydoun Hormozdiari and
    Arcadi Navarro and Maika Malig and Carl Baker and Choli Lee and Emily
    H Turner and Lin Chen and Jeffrey M Kidd and Nicoletta Archidiacono
    and Jay Shendure and Richard K Wilson and Evan E Eichler},
  title = {Gorilla genome structural variation reveals evolutionary parallelisms
	with chimpanzee},
  journal = {Genome Res},
  year = {2011},
  volume = {21},
  pages = {1640--1649},
  month = oct,
  abstract = {Structural variation has played an important role in the evolutionary
	restructuring of human and great ape genomes. Recent analyses have
	suggested that the genomes of chimpanzee and human have been particularly
	enriched for this form of genetic variation. Here, we set out to
	assess the extent of structural variation in the gorilla lineage
	by generating 10-fold genomic sequence coverage from a western lowland
	gorilla and integrating these data into a physical and cytogenetic
	framework of structural variation. We discovered and validated over
	7665 structural changes within the gorilla lineage, including sequence
	resolution of inversions, deletions, duplications, and mobile element
	insertions. A comparison with human and other ape genomes shows that
	the gorilla genome has been subjected to the highest rate of segmental
	duplication. We show that both the gorilla and chimpanzee genomes
	have experienced independent yet convergent patterns of structural
	mutation that have not occurred in humans, including the formation
	of subtelomeric heterochromatic caps, the hyperexpansion of segmental
	duplications, and bursts of retroviral integrations. Our analysis
	suggests that the chimpanzee and gorilla genomes are structurally
	more derived than either orangutan or human genomes.},
  file = {main:Ventura2011.pdf:PDF;supp_figs_tabs:Ventura2011supp.pdf:PDF;supp_legends:Ventura2011suppLegends.pdf:PDF;supp:Ventura2011suppMaterial.pdf:PDF},
  institution = {Department of Genome Sciences, University of Washington School of
	Medicine, Seattle, Washington 98195, USA.},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.124461.111},
  pmid = {21685127},
  timestamp = {2011.11.08},
}

@ARTICLE{Volik2006,
  author = {Stanislav Volik and Benjamin J Raphael and Guiqing Huang and Michael
	R Stratton and Graham Bignel and John Murnane and John H Brebner
	and Krystyna Bajsarowicz and Pamela L Paris and Quanzhou Tao and
	David Kowbel and Anna Lapuk and Dmitri A Shagin and Irina A Shagina
	and Joe W Gray and Jan-Fang Cheng and Pieter J de Jong and Pavel
	Pevzner and Colin Collins},
  title = {Decoding the fine-scale structure of a breast cancer genome and transcriptome},
  journal = {Genome Res},
  year = {2006},
  volume = {16},
  pages = {394--404},
  month = mar,
  abstract = {A comprehensive understanding of cancer is predicated upon knowledge
	of the structure of malignant genomes underlying its many variant
	forms and the molecular mechanisms giving rise to them. It is well
	established that solid tumor genomes accumulate a large number of
	genome rearrangements during tumorigenesis. End Sequence Profiling
	(ESP) maps and clones genome breakpoints associated with all types
	of genome rearrangements elucidating the structural organization
	of tumor genomes. Here we extend the ESP methodology in several directions
	using the breast cancer cell line MCF-7. First, targeted ESP is applied
	to multiple amplified loci, revealing a complex process of rearrangement
	and co-amplification in these regions reminiscent of breakage/fusion/bridge
	cycles. Second, genome breakpoints identified by ESP are confirmed
	using a combination of {DNA} sequencing and PCR. Third, in vitro
	functional studies assign biological function to a rearranged tumor
	{BAC} clone, demonstrating that it encodes anti-apoptotic activity.
	Finally, ESP is extended to the transcriptome identifying four novel
	fusion transcripts and providing evidence that expression of fusion
	genes may be common in tumors. These results demonstrate the distinct
	advantages of ESP including: (1) the ability to detect all types
	of rearrangements and copy number changes; (2) straightforward integration
	of ESP data with the annotated genome sequence; (3) immortalization
	of the genome; (4) ability to generate tumor-specific reagents for
	in vitro and in vivo functional studies. Given these properties,
	ESP could play an important role in a tumor genome project.},
  keywords = {Breast Neoplasms; Cell Line, Tumor; Chromosomes, Artificial, Bacterial;
	Chromosomes, Human; Female; Gene Expression Profiling; Genome, Human;
	Humans; In Situ Hybridization, Fluorescence; Molecular Sequence Data;
	Polymerase Chain Reaction; Reproducibility of Results; Sequence Analysis,
	{DNA}; Transcription, Genetic},
  owner = {calkan},
  pii = {gr.4247306},
  pmid = {16461635},
  timestamp = {2007.05.04},
}

% Short-form duplicate of the Wang2008 record; the key is kept so existing citations still resolve.
@ARTICLE{genome_sequence_1,
  author = {Wang, Jun and others},
  title = {The diploid genome sequence of an {Asian} individual},
  journal = {Nature},
  year = {2008},
  volume = {456},
  pages = {60--65},
  month = nov
}

@ARTICLE{Wang2008,
  author = {Jun Wang and Wei Wang and Ruiqiang Li and Yingrui Li and Geng Tian
	and Laurie Goodman and Wei Fan and Junqing Zhang and Jun Li and Juanbin
	Zhang and Yiran Guo and Binxiao Feng and Heng Li and Yao Lu and Xiaodong
	Fang and Huiqing Liang and Zhenglin Du and Dong Li and Yiqing Zhao
	and Yujie Hu and Zhenzhen Yang and Hancheng Zheng and Ines Hellmann
	and Michael Inouye and John Pool and Xin Yi and Jing Zhao and Jinjie
	Duan and Yan Zhou and Junjie Qin and Lijia Ma and Guoqing Li and
	Zhentao Yang and Guojie Zhang and Bin Yang and Chang Yu and Fang
	Liang and Wenjie Li and Shaochuan Li and Dawei Li and Peixiang Ni
	and Jue Ruan and Qibin Li and Hongmei Zhu and Dongyuan Liu and Zhike
	Lu and Ning Li and Guangwu Guo and Jianguo Zhang and Jia Ye and Lin
	Fang and Qin Hao and Quan Chen and Yu Liang and Yeyang Su and A.
	San and Cuo Ping and Shuang Yang and Fang Chen and Li Li and Ke Zhou
	and Hongkun Zheng and Yuanyuan Ren and Ling Yang and Yang Gao and
	Guohua Yang and Zhuo Li and Xiaoli Feng and Karsten Kristiansen and
	Gane Ka-Shu Wong and Rasmus Nielsen and Richard Durbin and Lars Bolund
	and Xiuqing Zhang and Songgang Li and Huanming Yang and Jian Wang},
  title = {The diploid genome sequence of an {Asian} individual},
  journal = {Nature},
  year = {2008},
  volume = {456},
  pages = {60--65},
  month = nov,
  abstract = {Here we present the first diploid genome sequence of an Asian individual.
	The genome was sequenced to 36-fold average coverage using massively
	parallel sequencing technology. We aligned the short reads onto the
	NCBI human reference genome to 99.97\% coverage, and guided by the
	reference genome, we used uniquely mapped reads to assemble a high-quality
	consensus sequence for 92\% of the Asian individual's genome. We
	identified approximately 3 million single-nucleotide polymorphisms
	(SNPs) inside this region, of which 13.6\% were not in the dbSNP
	database. Genotyping analysis showed that SNP identification had
	high accuracy and consistency, indicating the high sequence quality
	of this assembly. We also carried out heterozygote phasing and haplotype
	prediction against HapMap CHB and JPT haplotypes (Chinese and Japanese,
	respectively), sequence comparison with the two available individual
	genomes (J. D. Watson and J. C. Venter), and structural variation
	identification. These variations were considered for their potential
	biological impact. Our sequence data and analyses demonstrate the
	potential usefulness of next-generation sequencing technologies for
	personal genomics.},
  institution = {Beijing Genomics Institute at Shenzhen, Shenzhen 518000, China. wangj@genomics.org.cn},
  keywords = {Alleles; Animals; Asian Continental Ancestry Group; Consensus Sequence;
	Databases, Genetic; Diploidy; Genetic Predisposition to Disease;
	Genome, Human; Genomics; Haplotypes; Humans; Internet; Pan troglodytes;
	Phenotype; Polymorphism, Single Nucleotide; Sensitivity and Specificity;
	Sequence Alignment},
  owner = {calkan},
  pii = {nature07484},
  pmid = {18987735},
  timestamp = {2009.01.12},
}

@ARTICLE{Wang2007,
  author = {Kai Wang and Mingyao Li and Dexter Hadley and Rui Liu and Joseph
	Glessner and Struan F A Grant and Hakon Hakonarson and Maja Bucan},
  title = {{PennCNV}: an integrated hidden {Markov} model designed for high-resolution
	copy number variation detection in whole-genome {SNP} genotyping
	data},
  journal = {Genome Res},
  year = {2007},
  volume = {17},
  pages = {1665--1674},
  month = nov,
  abstract = {Comprehensive identification and cataloging of copy number variations
	(CNVs) is required to provide a complete view of human genetic variation.
	The resolution of CNV detection in previous experimental designs
	has been limited to tens or hundreds of kilobases. Here we present
	PennCNV, a hidden Markov model (HMM) based approach, for kilobase-resolution
	detection of CNVs from Illumina high-density SNP genotyping data.
	This algorithm incorporates multiple sources of information, including
	total signal intensity and allelic intensity ratio at each SNP marker,
	the distance between neighboring SNPs, the allele frequency of SNPs,
	and the pedigree information where available. We applied PennCNV
	to genotyping data generated for 112 HapMap individuals; on average,
	we detected approximately 27 CNVs for each individual with a median
	size of approximately 12 kb. Excluding common rearrangements in lymphoblastoid
	cell lines, the fraction of CNVs in offspring not detected in parents
	(CNV-NDPs) was 3.3\%. Our results demonstrate the feasibility of
	whole-genome fine-mapping of CNVs via high-density SNP genotyping.},
  institution = {Department of Genetics, University of Pennsylvania, Philadelphia,
	Pennsylvania 19104, USA.},
  keywords = {Gene Dosage; Genetic Variation; Genome, Human; Genotype; Humans; Markov
	Chains; Models, Statistical; Polymorphism, Single Nucleotide},
  owner = {calkan},
  pii = {gr.6861907},
  pmid = {17921354},
  timestamp = {2009.09.19},
}

@ARTICLE{Warren2007,
  author = {Ren{\'e} L Warren and Granger G Sutton and Steven J M Jones and Robert
	A Holt},
  title = {Assembling millions of short {DNA} sequences using {SSAKE}},
  journal = {Bioinformatics},
  year = {2007},
  volume = {23},
  pages = {500--501},
  month = feb,
  abstract = {Novel DNA sequencing technologies with the potential for up to three
	orders magnitude more sequence throughput than conventional Sanger
	sequencing are emerging. The instrument now available from Solexa
	Ltd, produces millions of short DNA sequences of 25 nt each. Due
	to ubiquitous repeats in large genomes and the inability of short
	sequences to uniquely and unambiguously characterize them, the short
	read length limits applicability for de novo sequencing. However,
	given the sequencing depth and the throughput of this instrument,
	stringent assembly of highly identical sequences can be achieved.
	We describe SSAKE, a tool for aggressively assembling millions of
	short nucleotide sequences by progressively searching through a prefix
	tree for the longest possible overlap between any two sequences.
	SSAKE is designed to help leverage the information from short sequence
	reads by stringently assembling them into contiguous sequences that
	can be used to characterize novel sequencing targets. Availability:
	http://www.bcgsc.ca/bioinfo/software/ssake.},
  keywords = {Algorithms; Base Sequence; Chromosome Mapping; Contig Mapping; Molecular
	Sequence Data; Sequence Analysis, DNA; Software},
  owner = {calkan},
  pii = {btl629},
  pmid = {17158514},
  timestamp = {2008.03.02},
}

@ARTICLE{razers,
  author = {David Weese and Anne-Katrin Emde and Tobias Rausch and Andreas D{\"o}ring
	and Knut Reinert},
  title = {{RazerS}---fast read mapping with sensitivity control},
  journal = {Genome Research},
  year = {2009},
  volume = {19},
  pages = {1646--1654},
}

@ARTICLE{Wheeler2008,
  author = {David A Wheeler and Maithreyan Srinivasan and Michael Egholm and
	Yufeng Shen and Lei Chen and Amy McGuire and Wen He and Yi-Ju Chen
	and Vinod Makhijani and G. Thomas Roth and Xavier Gomes and Karrie
	Tartaro and Faheem Niazi and Cynthia L Turcotte and Gerard P Irzyk
	and James R Lupski and Craig Chinault and Xing-zhi Song and Yue Liu
	and Ye Yuan and Lynne Nazareth and Xiang Qin and Donna M Muzny and
	Marcel Margulies and George M Weinstock and Richard A Gibbs and Jonathan
	M Rothberg},
  title = {The complete genome of an individual by massively parallel {DNA}
	sequencing},
  journal = {Nature},
  year = {2008},
  volume = {452},
  pages = {872--876},
  month = apr,
  abstract = {The association of genetic variation with disease and drug response,
	and improvements in nucleic acid technologies, have given great optimism
	for the impact of 'genomic medicine'. However, the formidable size
	of the diploid human genome, approximately 6 gigabases, has prevented
	the routine application of sequencing methods to deciphering complete
	individual human genomes. To realize the full potential of genomics
	for human health, this limitation must be overcome. Here we report
	the DNA sequence of a diploid genome of a single individual, James
	D. Watson, sequenced to 7.4-fold redundancy in two months using massively
	parallel sequencing in picolitre-size reaction vessels. This sequence
	was completed in two months at approximately one-hundredth of the
	cost of traditional capillary electrophoresis methods. Comparison
	of the sequence to the reference genome led to the identification
	of 3.3 million single nucleotide polymorphisms, of which 10,654 cause
	amino-acid substitution within the coding sequence. In addition,
	we accurately identified small-scale (2-40,000 base pair (bp)) insertion
	and deletion polymorphism as well as copy number variation resulting
	in the large-scale gain and loss of chromosomal segments ranging
	from 26,000 to 1.5 million base pairs. Overall, these results agree
	well with recent results of sequencing of a single individual by
	traditional methods. However, in addition to being faster and significantly
	less expensive, this sequencing technology avoids the arbitrary loss
	of genomic sequences inherent in random shotgun sequencing by bacterial
	cloning because it amplifies DNA in a cell-free system. As a result,
	we further demonstrate the acquisition of novel human sequence, including
	novel genes not previously identified by traditional genomic sequencing.
	This is the first genome sequenced by next-generation technologies.
	Therefore it is a pilot for the future challenges of 'personalized
	genome sequencing'.},
  file = {Published version:Wheeler2008.pdf:PDF},
  institution = {Human Genome Sequencing Center, Baylor College of Medicine, One Baylor
	Plaza, Houston, Texas 77030, USA.},
  keywords = {Alleles; Computational Biology; Genetic Predisposition to Disease;
	Genetic Variation; Genome, Human; Genomics; Genotype; Humans; Individuality;
	Male; Oligonucleotide Array Sequence Analysis; Polymorphism, Single
	Nucleotide; Reproducibility of Results; Sensitivity and Specificity;
	Sequence Alignment; Sequence Analysis, DNA; Software},
  owner = {calkan},
  pii = {nature06884},
  pmid = {18421352},
  timestamp = {2009.01.12},
}

% Short-form duplicate of the Wheeler2008 record; the key is kept so existing citations still resolve.
@ARTICLE{genome_sequence_3,
  author = {Wheeler, David A. and others},
  title = {The complete genome of an individual by massively parallel {DNA} sequencing},
  journal = {Nature},
  year = {2008},
  volume = {452},
  pages = {872--876},
  month = apr
}

@ARTICLE{Yang2007,
  author = {Yan Yang and Erwin K Chung and Yee Ling Wu and Stephanie L Savelli
	and Haikady N Nagaraja and Bi Zhou and Maddie Hebert and Karla N
	Jones and Yaoling Shu and Kathryn Kitzmiller and Carol A Blanchong
	and Kim L McBride and Gloria C Higgins and Robert M Rennebohm and
	Robert R Rice and Kevin V Hackshaw and Robert A S Roubey and Jennifer
	M Grossman and Betty P Tsao and Daniel J Birmingham and Brad H Rovin
	and Lee A Hebert and C. Yung Yu},
  title = {Gene copy-number variation and associated polymorphisms of complement
	component {C4} in human systemic lupus erythematosus ({SLE}): low
	copy number is a risk factor for and high copy number is a protective
	factor against {SLE} susceptibility in {European} {Americans}},
  journal = {Am J Hum Genet},
  year = {2007},
  volume = {80},
  pages = {1037--1054},
  month = jun,
  abstract = {Interindividual gene copy-number variation (CNV) of complement component
	C4 and its associated polymorphisms in gene size (long and short)
	and protein isotypes (C4A and C4B) probably lead to different susceptibilities
	to autoimmune disease. We investigated the C4 gene CNV in 1,241 European
	Americans, including patients with systemic lupus erythematosus (SLE),
	their first-degree relatives, and unrelated healthy subjects, by
	definitive genotyping and phenotyping techniques. The gene copy number
	(GCN) varied from 2 to 6 for total C4, from 0 to 5 for C4A, and from
	0 to 4 for C4B. Four copies of total C4, two copies of C4A, and two
	copies of C4B were the most common GCN counts, but each constituted
	only between one-half and three-quarters of the study populations.
	Long C4 genes were strongly correlated with C4A (R=0.695; P<.0001).
	Short C4 genes were correlated with C4B (R=0.437; P<.0001). In comparison
	with healthy subjects, patients with SLE clearly had the GCN of total
	C4 and C4A shifting to the lower side. The risk of SLE disease susceptibility
	significantly increased among subjects with only two copies of total
	C4 (patients 9.3\%; unrelated controls 1.5\%; odds ratio [OR] = 6.514;
	P=.00002) but decreased in those with > or =5 copies of C4 (patients
	5.79\%; controls 12\%; OR=0.466; P=.016). Both zero copies (OR=5.267;
	P=.001) and one copy (OR=1.613; P=.022) of C4A were risk factors
	for SLE, whereas > or =3 copies of C4A appeared to be protective
	(OR=0.574; P=.012). Family-based association tests suggested that
	a specific haplotype with a single short C4B in tight linkage disequilibrium
	with the -308A allele of TNFA was more likely to be transmitted to
	patients with SLE. This work demonstrates how gene CNV and its related
	polymorphisms are associated with the susceptibility to a human complex
	disease.},
  institution = {Center for Molecular and Human Genetics, Columbus Children's Research
	Institute, Columbus, OH 43205, USA.},
  keywords = {Adult; Alleles; Case-Control Studies; Cohort Studies; Complement C4,
	genetics; Disease Susceptibility; European Continental Ancestry Group,
	genetics; Female; Gene Dosage; Gene Frequency; Genetic Variation;
	Genetics, Population; Haplotypes; Humans; Lupus Erythematosus, Systemic,
	genetics/immunology; Male; Middle Aged; Polymorphism, Genetic; Reproducibility
	of Results; Risk Factors},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {S0002-9297(07)61023-4},
  pmid = {17503323},
  timestamp = {2010.09.15},
}

@ARTICLE{Ye2009,
  author = {Kai Ye and Marcel H Schulz and Quan Long and Rolf Apweiler and Zemin
	Ning},
  title = {{Pindel}: a pattern growth approach to detect break points of large
	deletions and medium sized insertions from paired-end short reads},
  journal = {Bioinformatics},
  year = {2009},
  volume = {25},
  pages = {2865--2871},
  month = nov,
  abstract = {There is a strong demand in the genomic community to develop effective
	algorithms to reliably identify genomic variants. Indel detection
	using next-gen data is difficult and identification of long structural
	variations is extremely challenging. We present Pindel, a pattern
	growth approach, to detect breakpoints of large deletions and medium-sized
	insertions from paired-end short reads. We use both simulated reads
	and real data to demonstrate the efficiency of the computer program
	and accuracy of the results. The binary code and a short user manual
	can be freely downloaded from http://www.ebi.ac.uk/~kye/pindel/.
	Contact: k.ye@lumc.nl; zn1@sanger.ac.uk.},
  file = {main:Ye2009.pdf:PDF},
  institution = {EMBL Outstation European Bioinformatics Institute, Wellcome Trust
	Genome Campus, Hinxton, Cambridge, UK. k.ye@lumc.nl},
  keywords = {Algorithms; Chromosome Breakpoints; Computational Biology, methods;
	DNA Breaks; Genome; INDEL Mutation; Sequence Analysis, DNA; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {btp394},
  pmid = {19561018},
  timestamp = {2011.07.19},
}

@ARTICLE{Yoon2009,
  author = {Seungtai Yoon and Zhenyu Xuan and Vladimir Makarov and Kenny Ye and
	Jonathan Sebat},
  title = {Sensitive and accurate detection of copy number variants using read
	depth of coverage},
  journal = {Genome Res},
  year = {2009},
  volume = {19},
  pages = {1586--1592},
  month = sep,
  abstract = {Methods for the direct detection of copy number variation (CNV) genome-wide
	have become effective instruments for identifying genetic risk factors
	for disease. The application of next-generation sequencing platforms
	to genetic studies promises to improve sensitivity to detect CNVs
	as well as inversions, indels, and SNPs. New computational approaches
	are needed to systematically detect these variants from genome sequence
	data. Existing sequence-based approaches for CNV detection are primarily
	based on paired-end read mapping (PEM) as reported previously by
	Tuzun et al. and Korbel et al. Due to limitations of the PEM approach,
	some classes of CNVs are difficult to ascertain, including large
	insertions and variants located within complex genomic regions. To
	overcome these limitations, we developed a method for CNV detection
	using read depth of coverage. Event-wise testing (EWT) is a method
	based on significance testing. In contrast to standard segmentation
	algorithms that typically operate by performing likelihood evaluation
	for every point in the genome, EWT works on intervals of data points,
	rapidly searching for specific classes of events. Overall false-positive
	rate is controlled by testing the significance of each possible event
	and adjusting for multiple testing. Deletions and duplications detected
	in an individual genome by EWT are examined across multiple genomes
	to identify polymorphism between individuals. We estimated error
	rates using simulations based on real data, and we applied EWT to
	the analysis of chromosome 1 from paired-end shotgun sequence data
	(30x) on five individuals. Our results suggest that analysis of read
	depth is an effective approach for the detection of CNVs, and it
	captures structural variants that are refractory to established PEM-based
	methods.},
  institution = {Cold Spring Harbor Laboratory, Cold Spring Harbor, New York 11724,
	USA.},
  owner = {calkan},
  pii = {gr.092981.109},
  pmid = {19657104},
  timestamp = {2009.09.22},
}

% NOTE(review): volume/pages below hold the PubMed chapter/unit locator, not numeric values; confirm against the publisher record.
@ARTICLE{Zerbino2010,
  author = {Daniel R Zerbino},
  title = {Using the {Velvet} de novo assembler for short-read sequencing technologies},
  journal = {Curr Protoc Bioinformatics},
  year = {2010},
  volume = {Chapter 11},
  pages = {Unit 11.5},
  month = sep,
  abstract = {The Velvet de novo assembler was designed to build contigs and eventually
	scaffolds from short-read sequencing data. This protocol describes
	how to use Velvet, interpret its output, and tune its parameters
	for optimal results. It also covers practical issues such as configuration,
	using the VelvetOptimiser routine, and processing colorspace data.},
  institution = {Center for Biomolecular Science and Engineering, Santa Cruz, California,
	USA.},
  keywords = {Base Sequence; Sequence Analysis, DNA, methods; Software},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {20836074},
  timestamp = {2012.03.05},
}

@ARTICLE{Zerbino2008,
  author = {Daniel R Zerbino and Ewan Birney},
  title = {{Velvet}: algorithms for de novo short read assembly using de {Bruijn}
	graphs},
  journal = {Genome Res},
  year = {2008},
  volume = {18},
  pages = {821--829},
  month = may,
  abstract = {We have developed a new set of algorithms, collectively called "Velvet,"
	to manipulate de Bruijn graphs for genomic sequence assembly. A de
	Bruijn graph is a compact representation based on short words (k-mers)
	that is ideal for high coverage, very short read (25-50 bp) data
	sets. Applying Velvet to very short reads and paired-ends information
	only, one can produce contigs of significant length, up to 50-kb
	N50 length in simulations of prokaryotic data and 3-kb N50 on simulated
	mammalian BACs. When applied to real Solexa data sets without read
	pairs, Velvet generated contigs of approximately 8 kb in a prokaryote
	and 2 kb in a mammalian BAC, in close agreement with our simulated
	results without read-pair information. Velvet represents a new approach
	to assembly that can leverage very short reads in combination with
	read pairs to produce useful assemblies.},
  file = {main:Zerbino2008.pdf:PDF},
  institution = {EMBL-European Bioinformatics Institute, Wellcome Trust Genome Campus,
	Hinxton, Cambridge CB10 1SD, United Kingdom.},
  keywords = {Algorithms; Animals; Chromosomes, Artificial, Bacterial; Computational
	Biology, methods; Computer Simulation; Genome, Bacterial; Genome,
	Human; Genomics; Humans; Mammals, genetics; Sequence Analysis, DNA,
	methods/standards; Streptococcus, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pii = {gr.074492.107},
  pmid = {18349386},
  timestamp = {2012.03.05},
}

@ARTICLE{Zerbino2009,
  author = {Daniel R Zerbino and Gayle K McEwen and Elliott H Margulies and Ewan
	Birney},
  title = {Pebble and {Rock Band}: heuristic resolution of repeats and scaffolding
	in the {Velvet} short-read de novo assembler},
  journal = {PLoS One},
  year = {2009},
  volume = {4},
  pages = {e8407},
  abstract = {Despite the short length of their reads, micro-read sequencing technologies
	have shown their usefulness for de novo sequencing. However, especially
	in eukaryotic genomes, complex repeat patterns are an obstacle to
	large assemblies. We present a novel heuristic algorithm, Pebble,
	which uses paired-end read information to resolve repeats and scaffold
	contigs to produce large-scale assemblies. In simulations, we can
	achieve weighted median scaffold lengths (N50) of above 1 Mbp in
	Bacteria and above 100 kbp in more complex organisms. Using real
	datasets we obtained a 96 kbp N50 in Pseudomonas syringae and a unique
	147 kbp scaffold of a ferret BAC clone. We also present an efficient
	algorithm called Rock Band for the resolution of repeats in the case
	of mixed length assemblies, where different sequencing platforms
	are combined to obtain a cost-effective assembly. These algorithms
	extend the utility of short read only assemblies into large complex
	genomes. They have been implemented and made available within the
	open-source Velvet short-read de novo assembler.},
  file = {main:Zerbino2009.pdf:PDF},
  institution = {European Bioinformatics Institute, Wellcome Trust Genome Campus,
	Hinxton, Cambridge, UK. zerbino@ebi.ac.uk},
  keywords = {Algorithms; Animals; Chromosomes, Artificial, Bacterial, genetics;
	Computer Simulation; Ferrets, genetics; Pseudomonas syringae, genetics;
	Repetitive Sequences, Nucleic Acid, genetics; Sequence Analysis,
	DNA, instrumentation/methods},
  language = {eng},
  medline-pst = {epublish},
  owner = {calkan},
  pmid = {20027311},
  timestamp = {2012.03.05},
}

@ARTICLE{Zhang2000,
  author = {Z. Zhang and S. Schwartz and L. Wagner and W. Miller},
  title = {A greedy algorithm for aligning {DNA} sequences},
  journal = {J Comput Biol},
  year = {2000},
  volume = {7},
  pages = {203--214},
  abstract = {For aligning DNA sequences that differ only by sequencing errors,
	or by equivalent errors from other sources, a greedy algorithm can
	be much faster than traditional dynamic programming approaches and
	yet produce an alignment that is guaranteed to be theoretically optimal.
	We introduce a new greedy alignment algorithm with particularly good
	performance and show that it computes the same alignment as does
	a certain dynamic programming algorithm, while executing over 10
	times faster on appropriate data. An implementation of this algorithm
	is currently used in a program that assembles the UniGene database
	at the National Center for Biotechnology Information.},
  file = {main:Zhang2000.pdf:PDF},
  keywords = {Algorithms; Biometry; DNA; Databases, Factual; Sequence Alignment;
	Sequence Analysis, DNA; Software},
  owner = {calkan},
  pmid = {10890397},
  timestamp = {2008.03.04},
}

@ARTICLE{Zody2008,
  author = {Michael C Zody and Zhaoshi Jiang and Hon-Chung Fung and Francesca
	Antonacci and LaDeana W Hillier and Maria Francesca Cardone and Tina
	A Graves and Jeffrey M Kidd and Ze Cheng and Amr Abouelleil and Lin
	Chen and John Wallis and Jarret Glasscock and Richard K Wilson and
	Amy Denise Reily and Jaime Duckworth and Mario Ventura and John Hardy
	and Wesley C Warren and Evan E Eichler},
  title = {Evolutionary toggling of the \textit{MAPT} 17q21.31 inversion region},
  journal = {Nat Genet},
  year = {2008},
  volume = {40},
  pages = {1076--1083},
  month = {Sep},
  abstract = {Using comparative sequencing approaches, we investigated the evolutionary
	history of the European-enriched 17q21.31 MAPT inversion polymorphism.
	We present a detailed, BAC-based sequence assembly of the inverted
	human H2 haplotype and compare it to the sequence structure and genetic
	variation of the corresponding 1.5-Mb region for the noninverted
	H1 human haplotype and that of chimpanzee and orangutan. We found
	that inversion of the MAPT region is similarly polymorphic in other
	great ape species, and we present evidence that the inversions occurred
	independently in chimpanzees and humans. In humans, the inversion
	breakpoints correspond to core duplications with the LRRC37 gene
	family. Our analysis favors the H2 configuration and sequence haplotype
	as the likely great ape and human ancestral state, with inversion
	recurrences during primate evolution. We show that the H2 architecture
	has evolved more extensive sequence homology, perhaps explaining
	its tendency to undergo microdeletion associated with mental retardation
	in European populations.},
  file = {main:Zody2008.pdf:PDF},
  institution = {Broad Institute of MIT and Harvard, 7 Cambridge Center, Cambridge,
	Massachusetts 02142, USA.},
  keywords = {Animals; Base Sequence; Chromosome Inversion; Chromosomes, Human,
	Pair 17; Evolution, Molecular; Gene Duplication; Humans; Models,
	Biological; Molecular Sequence Data; Pan troglodytes, genetics; Phylogeny;
	Polymorphism, Genetic; Pongo pygmaeus, genetics; Sequence Analysis,
	DNA; tau Proteins, genetics},
  language = {eng},
  medline-pst = {ppublish},
  owner = {calkan},
  pmid = {19165922},
  timestamp = {2010.09.15},
}

@ARTICLE{Brenner2000,
  author = {Sydney Brenner and Maria Johnson and John Bridgham and George Golda
    and David H. Lloyd and Davida Johnson and Shujun Luo and Sarah McCurdy and
    Michael Foy and Mark Ewan and Rithy Roth and Dave George and Sam Eletr and
    Glenn Albrecht and Eric Vermaas and Steven R. Williams and Keith Moon and
    Timothy Burcham and Michael Pallas and Robert B. DuBridge and James Kirchner
    and Karen Fearon and Jen-i Mao and Kevin Corcoran},
  title = {Gene expression analysis by massively parallel signature sequencing ({MPSS}) on microbead arrays},
  journal = {Nat Biotechnol},
  year = {2000},
  volume = {18},
  number = {6},
  pages = {630--634},
}

@book{Turnpenny2005,
  author = {Turnpenny, P. and Ellard, S.},
  title = {Emery's Elements of Medical Genetics},
  edition = {12th},
  publisher = {Elsevier Churchill Livingstone},
  year = {2005},
}

@ARTICLE{MUMmer,
  author = {Arthur L. Delcher and Simon Kasif and Robert D. Fleischmann and Jeremy Peterson and Owen White and Steven L. Salzberg},
  title = {Alignment of whole genomes},
  journal = {Nucl. Acids Res.},
  year = {1999},
  volume = {27},
  number = {11},
  pages = {2369--2376},
}

@ARTICLE{bowtie2,
  author = {Ben Langmead and Steven L Salzberg},
  title = {Fast gapped-read alignment with {Bowtie} 2},
  journal = {Nature Methods},
  year = {2012},
  volume = {9},
  pages = {357--359},
}

@ARTICLE{patternhunter,
  author = {Bin Ma and John Tromp and Ming Li},
  title = {{PatternHunter}: faster and more sensitive homology search},
  journal = {Bioinformatics},
  year = {2002},
  volume = {18},
  pages = {440--445},
}

}
